Scraping Facebook Private Messages

Facebook allows you to download a dump of your data, which is supposed to include private messages. For some reason it was extremely incomplete for me (less than 1% of a particular thread with about 10000 messages). This Python script will scrape a particular private message/chat thread. Expect maybe half an hour for a long thread.

Issues:

  • Facebook keeps changing their layout, login method, etc. This will obviously break the script.
  • Currently unimplemented: a way to dump all private messages, with everyone.
  • Facebook seems to sometimes detect rapid automated-looking behaviour like this and refuse a connection.
  • Older messages display date but not time. I decided to ignore time data for consistency.

Idiot’s guide: install Python 2 if on Windows and save the script as “dump.py”. Get Beautiful Soup 4 and place /bs4 in the same directory. Go find the token ID of the message thread you want to dump (look for something like “tid=id.12345” in the URL). Then, run

python dump.py email@email.com password tokenid

substituting in your own arguments. You now have a new directory that will contain all the dumped HTML pages (you can delete these if you want), a progress file that lets you resume a partial dump, and an XML file with your parsed message dump.

#!/usr/bin/python

import sys
import os
import re
import urllib
import urllib2
import cookielib
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import xml.sax.saxutils

def escape(string,escapeTable={}):
	return xml.sax.saxutils.escape(string,escapeTable).encode("ascii", "xmlcharrefreplace")

def parseDate(string):
	#hopefully I havent missed any date formats
	now = datetime.now()
	formats = [
		("%b %d, %Y",''),
		("%b %d, %Y",", %d" % now.year),
		("%b %d at %I:%M%p, %Y",''),
		("%b %d at %I:%M%p, %Y",", %d" % now.year)]
	for f in formats:
		try:
			return datetime.strptime(string.strip()+f[1],f[0]).date()
		except ValueError:
			continue
	weekday = string.split()[0]
	for day in range(7):
		testDay = now - timedelta(days=day)
		if weekday == "Yesterday" and day == 1:
			return testDay.date()
		if datetime.strftime(testDay,"%A") == weekday:
			return testDay.date()
	return now.date()

def parseMessage(soup):
	for i in soup.select("strong"):
		#strip name and subject
		i.clear()
	for i in soup.select("div"):
		try:
			i['data-sigil']
			if not i.br:
				i.append("\n")
		except KeyError: pass
	for i in soup.select("br"):
		i.replace_with('\n')
	#return soup.get_text().strip()
	return re.sub(u'\xad','',re.sub("\n ",'\n',soup.get_text().strip())) #im not sure what the weird unicode is

def main():
	#check arguments
	if len(sys.argv) != 4:
		usage()
	user = sys.argv[1]
	password = sys.argv[2]
	tid = sys.argv[3]

	#initialize url opener
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
	opener.addheaders = [('User-agent','Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1')] #facebook formats pages differently in each browser
	urllib2.install_opener(opener)

	#login to facebook
	data = urllib.urlencode({'email' : user, 'pass' : password})
	page = opener.open('https://m.facebook.com/login.php?m=m&refsrc=http%3A%2F%2Fm.facebook.com%2F&refid=8', data)
	if not re.search('Logout', page.read()):
		print 'Login Failed'
		sys.exit(1)

	#save to a message-specific directory
	if not os.path.isdir(tid):
		os.makedirs(tid)
	counter = 1
	while os.path.isfile(tid + "/dump%d.xml"%counter):
		counter += 1
	fileHandler = open(tid + "/dump%d.xml"%counter, 'a')
	fileHandler.write('<!--?xml version="1.0" ?-->')
	fileHandler.close()

	try:
		progressData = [line for line in open(tid+"/progress")]
		pageNo = int(progressData[0])
		url =  progressData[1]
	except IOError:
		pageNo = 1
		url = 'http://m.facebook.com/messages/read/?deleteselected=0&start=0&page_size=7&tids=%s&sk=inbox&tid=%s&see_older=1&refid=12' % (tid, tid)
	page = opener.open(url)
	html = page.read()

	while 1:
		if not url: break
		page = opener.open(url)
		url = None
		html = page.read()
		soup = BeautifulSoup(html)
		fileHandler = open(tid+"/page%d.html"%pageNo, 'w')
		fileHandler.write(html)
		fileHandler.close()
		pageNo += 1

		#loop through elements in thread div
		for div in soup.select("#messageGroup")[0].find_all("div",recursive=False):
			if div.get("id") == "see_older_messages":
				continue
			if div.get("id") == "see_newer_messages":
				fileHandler = open(tid+"/progress", 'w')
				url = "http://m.facebook.com" + div.a.get('href')
				fileHandler.write(str(pageNo) + "\n" + url)
				fileHandler.close()
				break

			#this is an actual message, let's parse it
			messageTag = div.select("div")[0] #get rid of any link blurb or whatever
			strong = messageTag.select("strong")
			author = strong[0].string
			subject = strong[1].string if len(strong) > 1 else ''
			date = str(parseDate(div.select("div")[-1].abbr.string.strip()))
			message = parseMessage(messageTag)

			fileHandler = open(tid + "/dump%d.xml"%counter, 'a')
			fileHandler.write('\n<![CDATA[\n' % (escape(author, {'"' : "&quot;"}), escape(subject, {'"' : "&quot;"}), date)) 			fileHandler.write(escape(message)) 			fileHandler.write("\n]]>")
			fileHandler.close()

def usage():
	print 'Usage: ' + sys.argv[0] + ' email password tid'
	sys.exit(2)

if __name__ == '__main__':
	main()
Advertisements
This entry was posted in Uncategorized. Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s