#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Extract FOAF rdf from Orkut.com
2004/02/17 started by 'andy'
I'm just doing my hobby, everything personal, sorry
"""
# Orkut endpoints and constants used throughout this module:
# connection target, page paths on that host, and the XML namespace that
# OrkutPerson.printFOAF passes to its output template.
orkut = {
    'host': 'www.orkut.com',
    'port': 80,
    'site_root': 'http://www.orkut.com/',
    'login': '/Login.aspx',
    'logout': '/Logout.aspx',
    'home': '/Home.aspx',
    'profile': '/Profile.aspx',
    'friend_list': '/FriendsList.aspx',
    'xmlns': 'http://www.orkut.com/xmlns/',
}
import re, httplib, Cookie, StringIO
import exceptions, os
class OrkutError(RuntimeError):
    """Raised when an Orkut page cannot be fetched or parsed.

    The built-in ``RuntimeError`` is the same object that the ``exceptions``
    module exposes on Python 2, so ``except RuntimeError`` handlers keep
    working.

    Args:
        args: human-readable message describing the failure.
    """

    def __init__(self, args):
        # Let the base class store the message as the 1-tuple ``(args,)``.
        # Assigning a str straight to ``self.args`` is coerced into a tuple
        # of single characters on Python 2.5+, which makes ``str(e)`` print
        # "('c', 'a', 'n', ...)" instead of the message.
        RuntimeError.__init__(self, args)
class OrkutPerson:
    """One Orkut member plus the data scraped for them.

    Attributes:
        uid: user id given to the constructor.
        name: display name given to the constructor.
        attribute: dict of profile fields, also reachable as ``person[key]``.
        crawled: 0 on construction; callers set it once the person is visited.
        friends: list of OrkutPerson objects, empty on construction.
        photoPath: value stored by setPhotoPath(); None until one is stored.
    """

    # NOTE(review): the RDF/XML markup below was reconstructed from the FOAF
    # vocabulary. The previous templates contained only whitespace and
    # placeholders (several with fewer placeholders than arguments, which
    # raised TypeError). Confirm element names and the "<base><uid>.jpg" /
    # "<base><uid>.rdf" URL scheme against the output consumers expect.
    _FOAF_HEAD = (
        '<rdf:RDF\n'
        '\txmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n'
        '\txmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"\n'
        '\txmlns:foaf="http://xmlns.com/foaf/0.1/"\n'
        '\txmlns:orkut="%(xmlns)s">\n'
        '\t<foaf:Person>\n'
        '\t\t<orkut:uid>%(uid)s</orkut:uid>\n'
        '\t\t<foaf:name>%(name)s</foaf:name>\n'
        '\t\t<foaf:img rdf:resource="%(base)s%(uid)s.jpg"/>\n'
    )
    _FOAF_FRIEND = (
        '\t\t<foaf:knows>\n'
        '\t\t\t<foaf:Person>\n'
        '\t\t\t\t<orkut:uid>%(uid)s</orkut:uid>\n'
        '\t\t\t\t<foaf:name>%(name)s</foaf:name>\n'
        '\t\t\t\t<rdfs:seeAlso rdf:resource="%(base)s%(uid)s.rdf"/>\n'
        '\t\t\t</foaf:Person>\n'
        '\t\t</foaf:knows>\n'
    )
    _FOAF_TAIL = '\t</foaf:Person>\n</rdf:RDF>\n'

    def __init__(self, uid, name):
        self.uid = uid
        self.name = name
        self.attribute = {}
        self.crawled = 0
        self.friends = []
        # Defined up front so getPhotoPath() never raises AttributeError.
        self.photoPath = None

    def setPhotoPath(self, path):
        """Remember the path of this person's photo."""
        self.photoPath = path

    def getPhotoPath(self):
        """Return the stored photo path, or None when none was stored."""
        return self.photoPath

    def __setitem__(self, key, value):
        """Store a profile field: ``person[key] = value``."""
        self.attribute[key] = value

    def __getitem__(self, key):
        """Return a profile field; raises KeyError when it is missing."""
        return self.attribute[key]

    def _elementName(self, key):
        """Turn a profile field name into a token usable as an XML tag name.

        Spaces, and any other character outside ``[A-Za-z0-9_.-]``, become
        underscores; a leading underscore is added when the result would be
        empty or would not start with a letter or underscore.
        """
        name = re.sub('[^A-Za-z0-9_.-]', '_', "%s" % (key,))
        if not name or not (name[0].isalpha() or name[0] == '_'):
            name = '_' + name
        return name

    def printFOAF(self, persons, basePath):
        """Return this person, their attributes and friends as RDF/XML text.

        Args:
            persons: accepted for interface compatibility; not used here.
            basePath: prefix placed in front of ``<uid>.jpg`` and
                ``<uid>.rdf`` when building resource URLs.

        Returns:
            The document as a single string.
        """
        from xml.sax.saxutils import escape

        def esc(value):
            # Scraped values may contain &, <, > or quotes; escape them so
            # the result stays well-formed in text and attribute positions.
            return escape("%s" % (value,), {'"': '&quot;'})

        out = [self._FOAF_HEAD % {
            'uid': esc(self.uid),
            'name': esc(self.name),
            'base': esc(basePath),
            'xmlns': esc(orkut['xmlns']),
        }]

        if 'webpage' in self.attribute:
            out.append('\t\t<foaf:homepage rdf:resource="%s"/>\n'
                       % esc(self.attribute['webpage']))
        if 'email' in self.attribute:
            out.append('\t\t<foaf:mbox rdf:resource="mailto:%s"/>\n'
                       % esc(self.attribute['email']))
        if 'IM' in self.attribute:
            # Expected shape is "<account>, <service>". Anything else is
            # skipped here instead of raising on tuple unpacking.
            pieces = self.attribute['IM'].rsplit(',', 1)
            if len(pieces) == 2 and pieces[1].strip() == 'MSN':
                out.append('\t\t<foaf:msnChatID>%s</foaf:msnChatID>\n'
                           % esc(pieces[0]))

        # Every attribute (including the ones handled above) is also emitted
        # under the orkut namespace; sorted so the output is reproducible.
        for key in sorted(self.attribute):
            tag = self._elementName(key)
            out.append('\t\t<orkut:%s>%s</orkut:%s>\n'
                       % (tag, esc(self.attribute[key]), tag))

        for friend in self.friends:
            out.append(self._FOAF_FRIEND % {
                'uid': esc(friend.uid),
                'name': esc(friend.name),
                'base': esc(basePath),
            })

        out.append(self._FOAF_TAIL)
        return ''.join(out)
class OrkutDriver:
    """Handles logging in to Orkut and downloading pages over one connection."""

    def __init__(self):
        # 0 means "no cookies yet"; login() replaces it with a SimpleCookie.
        self.cookie = 0
        self.connection = httplib.HTTPConnection(orkut['host'], orkut['port'])
        #self.connection.set_debuglevel( 1 )

    def _GetHeader(self, referer, keepalive=1):
        """Build the request headers.

        Args:
            referer: value for the Referer header; omitted when empty.
            keepalive: when true, ask the server to keep the connection open.

        Returns:
            A new dict of header names to values.
        """
        attrs = {}
        if referer:
            attrs['Referer'] = referer
        if self.cookie:
            # A Cookie header carries "name=value" pairs separated by "; ".
            # SimpleCookie.output() separates morsels with CRLF, which is
            # only correct when there is a single cookie.
            attrs['Cookie'] = '; '.join(
                ['%s=%s' % (key, self.cookie[key].coded_value)
                 for key in sorted(self.cookie.keys())])
        if keepalive:
            attrs['Connection'] = 'keep-alive'
        return attrs

    def _fetch(self, path, failure, keepalive=1):
        """GET ``path`` and return the response body.

        The body is always read to the end, with or without a Content-Length
        header, so the connection is ready for the next request even when
        the status check fails.

        Raises:
            OrkutError: with message ``failure`` when the status is not 200.
        """
        self.connection.request('GET', path, '', self._GetHeader('', keepalive))
        r = self.connection.getresponse()
        body = r.read()
        if r.status != 200:
            raise OrkutError(failure)
        return body

    def login(self, user, passwd):
        """Log in and discover the uid of the logged-in user.

        Stores ``user``, ``passwd`` and the session cookies on the instance
        and, on success, the uid found on the home page as ``self.uid``.

        Returns:
            1 on success, 0 when the login is rejected or the home page does
            not yield a uid.
        """
        try:
            from urllib import urlencode
        except ImportError:
            from urllib.parse import urlencode

        self.user = user
        self.passwd = passwd
        # urlencode so credentials containing '&', '=', '+' or spaces are
        # transmitted intact instead of corrupting the form body.
        body = urlencode([('u', user), ('p', passwd),
                          ('Submit.x', '0'), ('Submit.y', '0')])
        attrs = self._GetHeader('')
        attrs['Content-Type'] = 'application/x-www-form-urlencoded'
        self.connection.request('POST', orkut['login'], body, attrs)
        r = self.connection.getresponse()
        # Drain the redirect body; otherwise the next getresponse() on this
        # connection fails because this response is still pending.
        r.read()
        self.cookie = Cookie.SimpleCookie()
        setCookie = r.getheader('Set-Cookie')
        if setCookie:
            self.cookie.load(setCookie)
        if not (r.status == 302 and r.getheader('Location') == orkut['home']):
            return 0
        # Home - extract my uid
        try:
            page = self._fetch(orkut['home'], "can't get home")
        except OrkutError:
            return 0
        mo = re.search('"ProfileC.aspx\\?uid=([^"]*)"', page)
        if not mo:
            return 0
        self.uid = mo.group(1)
        return 1

    def logout(self):
        """Request the logout page; return True when the server answers 200."""
        self.connection.request('GET', orkut['logout'], '', self._GetHeader('', 0))
        r = self.connection.getresponse()
        r.read()
        return r.status == 200

    def getProfile(self, uid):
        """Return the profile page for ``uid``; raise OrkutError on failure."""
        return self._fetch("%s?uid=%s" % (orkut['profile'], uid),
                           "can't get profile")

    def getFriendList(self, uid):
        """Return the friend-list page for ``uid``; raise OrkutError on failure."""
        return self._fetch("%s?uid=%s" % (orkut['friend_list'], uid),
                           "can't get friend list")

    def getPhoto(self, path):
        """Return the bytes served at ``path``; raise OrkutError on failure.

        A leading '/' is added when ``path`` does not start with one.
        """
        if not path.startswith('/'):
            path = '/' + path
        return self._fetch(path, "can't get photo")
def parseFriendList(src):
    """Extract the friends listed on a friend-list page.

    Args:
        src: text of the friend-list page.

    Returns:
        A list with one ``OrkutPerson(group(1), group(2))`` per match of the
        friend pattern, in page order; empty when nothing matches.

    Raises:
        OrkutError: when the outer pattern does not match ``src``.

    NOTE(review): both patterns are kept exactly as found, but they look as
    if their markup was lost. The friend pattern defines a single capture
    group while ``group(2)`` is read, so any match raises IndexError.
    Restore the patterns from a known-good copy before relying on this.
    """
    mo = re.search('(.*)', src)
    if not mo:
        raise OrkutError("invalid friend list")
    friendRe = re.compile(']*>([^<]*)')
    # finditer yields the same successive non-overlapping matches as calling
    # search() again from the end of the previous match.
    return [OrkutPerson(m.group(1), m.group(2))
            for m in friendRe.finditer(mo.group(1))]
def parseProfile( src, person ):
    """Scrape a profile page and store what is found on ``person``.

    Args:
        src: text of the profile page; it is only passed to regex searches.
        person: object updated through ``setPhotoPath()`` and item
            assignment (``person[key] = value``).

    Returns:
        None; ``person`` is modified in place.

    NOTE(review): both pattern literals below are split across two source
    lines inside single quotes, which Python does not accept, and the photo
    pattern has no capture group although ``mo.group(1)`` is read. They look
    as if their markup was lost -- restore them from a known-good copy; the
    code is deliberately left untouched here.
    """
    # Photo: group(1) of the first match is stored as the photo path.
    mo = re.search( '
', src )
    if mo:
        person.setPhotoPath( mo.group(1) )
    # Profile fields: each match stores group(2) under the key group(1).
    profileRe = re.compile( '
]*>([^:]*): | ([^<]*) | ' )
    pos = 0
    while 1:
        mo = profileRe.search( src, pos )
        if not mo:
            break
        person[mo.group(1)] = mo.group(2)
        # Resume scanning right after the text consumed by this match.
        pos = mo.end()
# Non-zero enables progress messages on standard output.
verbose = 1


def crawlUser(od, persons, uid, person, outpath):
    """Download and parse one user's profile and friend list.

    Args:
        od: driver providing ``getProfile(uid)`` and ``getFriendList(uid)``.
        persons: dict of uid -> person; friends not yet present are added.
        uid: id of the user to crawl.
        person: object that receives the parsed profile and friends; its
            ``crawled`` flag is set to 1 before anything is downloaded.
        outpath: accepted for interface compatibility; not used here.

    Exceptions raised by the driver or the parsers propagate to the caller.
    """
    import sys

    if verbose:
        print("Crawl %s(%s)" % (person.name, uid))
    person.crawled = 1
    if verbose:
        print("\tprofile...")
    parseProfile(od.getProfile(uid), person)
    if verbose:
        # No newline: the friend count is appended to this line. The trailing
        # space reproduces the separator that the Python 2 "print x," form
        # inserted; flushing shows the text before the download starts.
        sys.stdout.write("\tfriend list... ")
        sys.stdout.flush()
    friends = parseFriendList(od.getFriendList(uid))
    person.friends = friends
    if verbose:
        print(" %d friends found" % len(friends))
    for f in friends:
        if verbose:
            print("\t%s(%s)" % (f.name, f.uid))
        # Keep the entry that is already registered for this uid, if any.
        persons.setdefault(f.uid, f)
def _writeFile(path, mode, data):
    """Write ``data`` to ``path``, closing the file even if the write fails."""
    with open(path, mode) as f:
        f.write(data)


def fuxx(user, passwd, initialUID, outpath, level, base):
    """Log in, crawl outwards from a starting user and save the results.

    For every person crawled, ``<uid>.rdf`` (the printFOAF() output) and,
    when a photo path is known, ``<uid>.jpg`` are written into ``outpath``.

    Args:
        user: login name; also used as the display name of the start person
            when ``initialUID`` is empty.
        passwd: login password.
        initialUID: uid to start from; when empty, the logged-in user's uid.
        outpath: directory that receives the output files.
        level: number of crawl rounds; each round visits every person known
            at its start that has not been crawled yet.
        base: passed to printFOAF() as its ``basePath``.

    Returns:
        None. Returns early, without crawling, when the login fails.
    """
    od = OrkutDriver()
    if verbose:
        print("login")
    if not od.login(user, passwd):
        if verbose:
            print("login failed")
        return

    try:
        persons = {}
        if not initialUID:
            persons[od.uid] = OrkutPerson(od.uid, user)
        else:
            persons[initialUID] = OrkutPerson(initialUID, "John Doe")

        for depth in range(level):
            if verbose:
                print("start level %d" % depth)
            # Snapshot the keys: crawlUser() adds entries while we iterate.
            for uid in list(persons.keys()):
                p = persons[uid]
                if p.crawled:
                    continue
                try:
                    crawlUser(od, persons, p.uid, p, outpath)
                    # Write the RDF first so that a missing or unreachable
                    # photo does not cost us the profile data.
                    _writeFile(os.path.join(outpath, p.uid + ".rdf"), "wt",
                               p.printFOAF(persons, base))
                    try:
                        photoPath = p.getPhotoPath()
                    except AttributeError:
                        # No photo path was ever stored for this person.
                        photoPath = None
                    if photoPath:
                        _writeFile(os.path.join(outpath, p.uid + ".jpg"), "wb",
                                   od.getPhoto(photoPath))
                except Exception as e:
                    # Best effort: report and continue with the next person.
                    print(e)
                    if verbose:
                        print("crawl user %s failed." % p.uid)
    finally:
        # Always end the session, even when the crawl is interrupted.
        od.logout()
def cli():
    """Command-line interface (looks at sys.argv to decide what to do).

    Options: ``-o DIR`` output directory, ``-l N`` crawl level, ``-u UID``
    start uid, ``-r BASE`` site root, ``-s`` silent. Two positional
    arguments, user and password, are required. On any usage error the usage
    text is printed and nothing else happens.

    ``-s`` is parsed as a flag without a value, as the usage text documents,
    and it updates the module-level ``verbose`` switch.
    """
    import getopt
    import sys

    # Without this declaration "-s" would only bind a local name and the
    # progress messages would keep being printed.
    global verbose

    class BadUsage(Exception):
        pass

    initial = ''
    level = 1
    outpath = './'
    base = ''
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'o:l:u:sr:')
        for opt, val in opts:
            if opt == '-o':
                outpath = val
            elif opt == '-l':
                try:
                    level = int(val)
                except ValueError:
                    raise BadUsage
            elif opt == '-u':
                initial = val
            elif opt == '-s':
                verbose = 0
            elif opt == '-r':
                base = val
        if len(args) < 2:
            raise BadUsage
        user = args[0]
        passwd = args[1]
        if not user or not passwd:
            raise BadUsage
    except (getopt.error, BadUsage):
        print("""orkutfuxx - fuxx orkut and bear FOAF
Usage : %s [OPTION] user passwd
-o out put directory
-l extracting level
-u start uid
-r site root
-s silent
""" % sys.argv[0])
        return
    fuxx(user, passwd, initial, outpath, level, base)


if __name__ == '__main__':
    cli()