Scraping Facebook Private Messages

Facebook allows you to download a dump of your data, which is supposed to include private messages. For some reason it was extremely incomplete for me (less than 1% of a particular thread with about 10000 messages). This Python script will scrape a particular private message/chat thread. Expect maybe half an hour for a long thread.


  • Facebook keeps changing their layout, login method, etc. This will obviously break the script.
  • Currently unimplemented: a way to dump all private messages, with everyone.
  • Facebook seems to sometimes detect rapid automated-looking behaviour like this and refuse a connection.
  • Older messages display date but not time. I decided to ignore time data for consistency.

Idiot’s guide: install Python 2 if on Windows and save the script below to a file with a “.py” extension (any name will do). Get Beautiful Soup 4 and place its /bs4 directory in the same directory as the script. Go find the token ID of the message thread you want to dump (look for something like “tid=id.12345” in the URL). Then, run

python yourscript.py email password tokenid

substituting in your own arguments. You now have a new directory, named after the token ID, that will contain all the dumped HTML pages (you can delete these if you want), a progress file that lets you resume a partial dump, and an XML file with your parsed message dump.


import sys
import os
import re
import urllib
import urllib2
import cookielib
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import xml.sax.saxutils

def escape(string,escapeTable=None):
	"""Return *string* XML-escaped and encoded to ASCII.

	The characters &, < and > are always escaped.  *escapeTable* optionally
	maps further substrings to their replacements (for example the double
	quote to &quot; when the text goes into an attribute value).  Any
	non-ASCII character is emitted as a numeric character reference
	(&#NNN;), so the returned byte string is plain ASCII.
	"""
	# Use None rather than a shared mutable {} as the default argument.
	if escapeTable is None:
		escapeTable = {}
	return xml.sax.saxutils.escape(string,escapeTable).encode("ascii", "xmlcharrefreplace")

def parseDate(string):
	"""Parse a message timestamp label into a ``datetime.date``.

	Recognised forms (any time-of-day part is discarded):
	  * "Jan 5, 2012"
	  * "Jan 5"                   -- the current year is assumed
	  * "Jan 5 at 3:00pm, 2012"
	  * "Jan 5 at 3:00pm"         -- the current year is assumed
	  * "Yesterday ..."
	  * "Monday ..." etc.         -- the most recent such weekday, today included

	Returns None when the text is empty or matches none of these forms.

	NOTE(review): the published listing of this function had lost several
	lines (the right-hand side of ``now``, the ``try:``, the ``except`` body
	and the bodies of the weekday branches).  They are reconstructed here
	from the surrounding code; confirm against the original script.
	"""
	#hopefully I havent missed any date formats
	now = datetime.now()
	text = string.strip()
	if not text:
		# Nothing to parse; also avoids an IndexError on split()[0] below.
		return None
	# (strptime format, suffix appended to the text before parsing).  The
	# suffix supplies the current year for labels that omit it.
	formats = [
		("%b %d, %Y",''),
		("%b %d, %Y",", %d" % now.year),
		("%b %d at %I:%M%p, %Y",''),
		("%b %d at %I:%M%p, %Y",", %d" % now.year)]
	for f in formats:
		try:
			return datetime.strptime(text+f[1],f[0]).date()
		except ValueError:
			# Not this format; try the next one.
			pass
	# Relative labels: the first word is "Yesterday" or a weekday name.
	weekday = text.split()[0]
	if weekday == "Yesterday":
		return (now - timedelta(days=1)).date()
	for day in range(7):
		testDay = now - timedelta(days=day)
		if testDay.strftime("%A") == weekday:
			return testDay.date()
	return None

# Reduce one message element (the parsed-HTML object passed as `soup`) to its
# plain text and return that text.
# NOTE(review): this function is not valid Python as listed. Each of the three
# `for i in"...")` headers has lost its iterable expression (presumably a
# lookup of the named tag on `soup` -- confirm), the loop bodies are gone, the
# `if not` condition is cut off, and the `try:` that should pair with
# `except KeyError` is missing. Restore these from the original script.
def parseMessage(soup):
	for i in"strong"):
		#strip name and subject
	for i in"div"):
			if not
		except KeyError: pass
	for i in"br"):
	#return soup.get_text().strip()
	# Take the stripped text, turn every "newline + space" into a bare newline,
	# then delete every U+00AD (SOFT HYPHEN) character.
	return re.sub(u'\xad','',re.sub("\n ",'\n',soup.get_text().strip())) #im not sure what the weird unicode is

# Script entry point: reads email, password and thread id from the command
# line, logs in, then walks the thread page by page, appending each parsed
# message to a dump<N>.xml file inside a directory named after the thread id.
# NOTE(review): this listing is not valid Python. Many expressions and block
# bodies were lost (each is flagged below); restore them from the original
# script before running.
# NOTE(review): `fileHandler` is reassigned several times and never closed in
# the code shown.
def main():
	#check arguments
	# Exactly three arguments are expected after the script name.
	# NOTE(review): the body of this `if` is missing -- presumably it called
	# usage() and exited; confirm.
	if len(sys.argv) != 4:
	user = sys.argv[1]
	password = sys.argv[2]
	tid = sys.argv[3]

	#initialize url opener
	# The opener carries a cookie jar, so cookies set by one response are sent
	# with later requests made through it.
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
	opener.addheaders = [('User-agent','Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1')] #facebook formats pages differently in each browser

	#login to facebook
	data = urllib.urlencode({'email' : user, 'pass' : password})
	# NOTE(review): the next two lines are truncated -- the request call, its
	# URL, and the expression that looks for 'Logout' are missing.
	page ='', data)
	if not'Logout',
		print 'Login Failed'

	#save to a message-specific directory
	# NOTE(review): the body of this `if` is missing -- presumably it creates
	# the directory; confirm.
	if not os.path.isdir(tid):
	# Use the first dump<N>.xml name that does not exist yet, so earlier dumps
	# in the same directory are not overwritten.
	counter = 1
	while os.path.isfile(tid + "/dump%d.xml"%counter):
		counter += 1
	fileHandler = open(tid + "/dump%d.xml"%counter, 'a')
	# NOTE(review): this writes an XML declaration wrapped in comment markers;
	# possibly the page renderer altered the original literal -- confirm.
	fileHandler.write('<!--?xml version="1.0" ?-->')

	# Resume support: the progress file holds the next page number on its
	# first line and the next URL on its second line.
	# NOTE(review): the `try:` that pairs with `except IOError` is missing.
		progressData = [line for line in open(tid+"/progress")]
		pageNo = int(progressData[0])
		url =  progressData[1]
	except IOError:
		# No progress file: start from page 1.
		# NOTE(review): the URL template is empty here; as written,
		# '' % (tid, tid) raises TypeError.
		pageNo = 1
		url = '' % (tid, tid)
	# NOTE(review): right-hand sides of the next two assignments are missing.
	page =
	html =

	# One iteration per thread page; stops when no next-page URL was found.
	while 1:
		if not url: break
		# NOTE(review): right-hand side missing (presumably fetches `url`).
		page =
		# Cleared here; only set again below when a "see_newer_messages" link
		# is present on the page.
		url = None
		# NOTE(review): right-hand side missing.
		html =
		soup = BeautifulSoup(html)
		# NOTE(review): the file is opened for the raw page, but no write call
		# appears in this listing.
		fileHandler = open(tid+"/page%d.html"%pageNo, 'w')
		pageNo += 1

		#loop through elements in thread div
		# NOTE(review): the iterable is truncated; what remains takes the first
		# match for "#messageGroup" and iterates its direct child divs.
		for div in"#messageGroup")[0].find_all("div",recursive=False):
			# NOTE(review): the body of the next `if` is missing (presumably it
			# skips this div), and the indentation of the lines under the
			# following `if` may have been affected as well -- confirm.
			if div.get("id") == "see_older_messages":
			if div.get("id") == "see_newer_messages":
				# Record the next page number and URL so an interrupted run can
				# resume from here.
				# NOTE(review): the URL prefix literal is empty in this listing.
				fileHandler = open(tid+"/progress", 'w')
				url = "" + div.a.get('href')
				fileHandler.write(str(pageNo) + "\n" + url)

			#this is an actual message, let's parse it
			# NOTE(review): the lookups on the next two lines and inside the
			# parseDate(...) call are truncated.
			messageTag ="div")[0] #get rid of any link blurb or whatever
			strong ="strong")
			# First <strong> is the author; a second one, if present, is the subject.
			author = strong[0].string
			subject = strong[1].string if len(strong) > 1 else ''
			date = str(parseDate("div")[-1].abbr.string.strip()))
			message = parseMessage(messageTag)

			# Append this message to the XML dump.
			# NOTE(review): three write statements are run together on one line,
			# and the first format string has no placeholders for its three
			# arguments -- the element markup appears to have been stripped.
			fileHandler = open(tid + "/dump%d.xml"%counter, 'a')
			fileHandler.write('\n<![CDATA[\n' % (escape(author, {'"' : "&quot;"}), escape(subject, {'"' : "&quot;"}), date)) 			fileHandler.write(escape(message)) 			fileHandler.write("\n]]>")

def usage():
	"""Print the expected command line: script name, then email, password, tid."""
	message = ''.join(['Usage: ', sys.argv[0], ' email password tid'])
	print(message)

if __name__ == '__main__':
	# Run the scraper only when this file is executed as a script.
	# NOTE(review): the body of this guard was missing from the published
	# listing; the call to main() is restored here.
	main()
This entry was posted in Uncategorized. Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s