#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Extract FOAF rdf from Orkut.com

2004/02/17 started by 'andy'

I'm just doing my hobby, everything personal, sorry

NOTE(review): several string literals in this file (the FOAF templates in
OrkutPerson.printFOAF and the regular expressions in parseFriendList /
parseProfile) look as if their markup was stripped at some point: their
placeholder or group counts no longer match the code that uses them.  They
are kept exactly as found and flagged where they occur; restore them from a
pristine copy before relying on the output.
"""

import re
import os
import sys

try:  # Python 2 module names, as originally imported by this script
    import httplib
    import Cookie
    import StringIO
    import exceptions
    from urllib import quote_plus
except ImportError:  # Python 3: the same APIs under their current names
    import http.client as httplib
    import http.cookies as Cookie
    import io as StringIO
    import builtins as exceptions
    from urllib.parse import quote_plus

# orkut urls
orkut = {
    'host': 'www.orkut.com',
    'port': 80,
    'site_root': 'http://www.orkut.com/',
    'login': '/Login.aspx',
    'logout': '/Logout.aspx',
    'home': '/Home.aspx',
    'profile': '/Profile.aspx',
    'friend_list': '/FriendsList.aspx',
    'xmlns': 'http://www.orkut.com/xmlns/',
}

# Module-wide switch for progress output; cli() clears it for the -s option.
verbose = 1


def _as_text(data):
    """Return *data* as text, decoding it first when it is a bytes object.

    On Python 2 the HTTP body is already a ``str`` and is returned untouched.
    NOTE(review): UTF-8 (with replacement of bad bytes) is an assumption;
    the encoding actually served by the site is not visible in this file.
    """
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _write_file(path, mode, data):
    """Write *data* to *path* opened with *mode*, always closing the file."""
    with open(path, mode) as handle:
        handle.write(data)


class OrkutError(exceptions.RuntimeError):
    """Raised when a page cannot be fetched or parsed."""

    def __init__(self, args):
        # Go through the base initialiser: assigning a string straight to
        # self.args turns it into a tuple of single characters, which makes
        # str(error) unreadable.
        exceptions.RuntimeError.__init__(self, args)


class OrkutPerson:
    """One crawled user: uid, display name, profile attributes and friends."""

    def __init__(self, uid, name):
        self.uid = uid
        self.name = name
        self.attribute = {}   # profile key -> value, filled by parseProfile
        self.crawled = 0      # set to 1 by crawlUser
        self.friends = []     # OrkutPerson objects from the friend list
        self.photoPath = None  # stays None until a photo path is parsed

    def setPhotoPath(self, path):
        """Remember the server path of this person's photo."""
        self.photoPath = path

    def getPhotoPath(self):
        """Return the photo path, or None when none has been set."""
        return self.photoPath

    def __setitem__(self, key, value):
        self.attribute[key] = value

    def __getitem__(self, key):
        return self.attribute[key]

    def printFOAF(self, persons, basePath):
        """Return this person's FOAF document as a string.

        *persons* is accepted for interface compatibility and is not used.
        *basePath* is offered to the templates under the key ``base``.

        NOTE(review): the templates below contain no markup and some have
        fewer ``%s`` placeholders than arguments, so the ``%`` operations on
        the webpage, email and per-attribute lines raise TypeError as
        written.  The literals are left untouched because the intended text
        cannot be determined from this file.
        """
        out = StringIO.StringIO()
        d = {
            'uid': self.uid,
            'name': self.name,
            'base': basePath,
            'xmlns': orkut['xmlns'],
        }
        out.write(""" \t \t \t\t \t \t \t\t%(uid)s \t\t%(name)s \t\t """ % d)

        if 'webpage' in self.attribute:
            out.write("\t\t\n" % self.attribute['webpage'])
        if 'email' in self.attribute:
            out.write("\t\t\n" % self.attribute['email'])
        if 'IM' in self.attribute:
            im = self.attribute['IM']
            # Split on the last comma into handle and service; a value with
            # no comma is skipped instead of raising on tuple unpacking.
            if ',' in im:
                handle, service = im.rsplit(',', 1)
                if service.strip() == 'MSN':
                    out.write("\t\t%s\n" % handle)

        # attrs
        for a in self.attribute:
            v = self.attribute[a]
            key = a.replace(' ', '_')
            out.write("\t\t%s\n" % (key, v, key))

        # friends
        for f in self.friends:
            fd = {'uid': f.uid, 'name': f.name, 'base': basePath}
            out.write(""" \t\t \t\t\t \t\t\t\t%(uid)s \t\t\t\t%(name)s \t\t\t\t \t\t\t \t\t """ % fd)

        out.write("""\t """)
        return out.getvalue()


class OrkutDriver:
    """Handles logging in to Orkut and downloading its pages."""

    def __init__(self):
        self.cookie = 0  # replaced by a SimpleCookie once login() has run
        self.connection = httplib.HTTPConnection(orkut['host'], orkut['port'])
        #self.connection.set_debuglevel( 1 )

    def _GetHeader(self, referer, keepalive=1):
        """Build the request headers: Referer, Cookie and Connection."""
        attrs = {}
        if referer:
            attrs['Referer'] = referer
        if self.cookie:
            # A Cookie request header is a single "name=value; name=value"
            # line.  SimpleCookie.output() separates cookies with CRLF, which
            # would split the header as soon as there is more than one.
            attrs['Cookie'] = '; '.join(
                ['%s=%s' % (m.key, m.coded_value) for m in self.cookie.values()])
        if keepalive:
            attrs['Connection'] = 'keep-alive'
        return attrs

    def _request(self, method, path, body='', keepalive=1, content_type=None):
        """Send one request and return ``(response, body)``.

        The body is always read in full, whatever the status: the shared
        connection cannot send another request until the previous response
        has been consumed.  Reading without a length also avoids depending
        on a Content-Length header being present.
        """
        headers = self._GetHeader('', keepalive)
        if content_type:
            headers['Content-Type'] = content_type
        self.connection.request(method, path, body, headers)
        response = self.connection.getresponse()
        return response, response.read()

    def login(self, user, passwd):
        """Log in and discover the caller's own uid.

        Returns 1 on success (``self.uid`` is then set) and 0 otherwise.
        """
        self.user = user
        self.passwd = passwd

        # login -- the values are form-encoded so that characters such as
        # '&' or '=' in the credentials cannot corrupt the request body.
        body = "u=%s&p=%s&Submit.x=0&Submit.y=0" % (
            quote_plus(user), quote_plus(passwd))
        r, _ = self._request('POST', orkut['login'], body,
                             content_type='application/x-www-form-urlencoded')
        self.cookie = Cookie.SimpleCookie()
        raw_cookie = r.getheader('Set-Cookie')
        if raw_cookie:
            self.cookie.load(raw_cookie)
        if not (r.status == 302 and r.getheader('Location') == orkut['home']):
            return 0

        # Home - extract my uid
        r, page = self._request('GET', orkut['home'])
        if r.status != 200:
            return 0
        mo = re.search('"ProfileC.aspx\\?uid=([^"]*)"', _as_text(page))
        if not mo:
            return 0
        self.uid = mo.group(1)
        return 1

    def logout(self):
        """Request the logout page; return whether the status was 200."""
        r, _ = self._request('GET', orkut['logout'], keepalive=0)
        return r.status == 200

    def getProfile(self, uid):
        """Return the profile page of *uid*; raise OrkutError unless 200."""
        r, page = self._request('GET', "%s?uid=%s" % (orkut['profile'], uid))
        if r.status != 200:
            raise OrkutError("can't get profile")
        return _as_text(page)

    def getFriendList(self, uid):
        """Return the friend-list page of *uid*; raise OrkutError unless 200."""
        r, page = self._request('GET', "%s?uid=%s" % (orkut['friend_list'], uid))
        if r.status != 200:
            raise OrkutError("can't get friend list")
        return _as_text(page)

    def getPhoto(self, path):
        """Return the raw bytes found at *path*; raise OrkutError unless 200."""
        if path[0] != '/':
            path = '/' + path
        r, data = self._request('GET', path)
        if r.status != 200:
            raise OrkutError("can't get photo")
        return data


def parseFriendList(src):
    """Return the OrkutPerson objects found in friend-list page *src*.

    NOTE(review): both patterns appear to have lost their HTML context.  The
    first now matches any input, so "invalid friend list" is never raised,
    and the second defines one group although two are read (IndexError on
    the first hit).  They are kept exactly as found.
    """
    mo = re.search('(.*)', src)
    if not mo:
        raise OrkutError("invalid friend list")
    buf = mo.group(1)

    friendRe = re.compile(']*>([^<]*)')
    return [OrkutPerson(m.group(1), m.group(2)) for m in friendRe.finditer(buf)]


def parseProfile(src, person):
    """Copy the photo path and the ``key:value`` pairs of *src* into *person*.

    NOTE(review): the photo pattern is empty, so it always matches and
    ``group(1)`` raises IndexError.  It is kept exactly as found.
    """
    # photo
    mo = re.search('', src)
    if mo:
        person.setPhotoPath(mo.group(1))

    # profile
    profileRe = re.compile(']*>([^:]*):([^<]*)')
    for m in profileRe.finditer(src):
        person[m.group(1)] = m.group(2)


def crawlUser(od, persons, uid, person, outpath):
    """Fetch profile and friends of *uid* and register unseen friends.

    *person* is marked as crawled and filled in; each friend whose uid is
    not yet a key of *persons* is added to it.  *outpath* is unused and kept
    for interface compatibility.
    """
    if verbose:
        print("Crawl %s(%s)" % (person.name, uid))
    person.crawled = 1

    if verbose:
        print("\tprofile...")
    parseProfile(od.getProfile(uid), person)

    if verbose:
        # No newline here: the friend count is appended to this line below.
        sys.stdout.write("\tfriend list... ")
    friends = parseFriendList(od.getFriendList(uid))
    person.friends = friends
    if verbose:
        print(" %d friends found" % len(friends))

    for f in friends:
        if verbose:
            print("\t%s(%s)" % (f.name, f.uid))
        if f.uid not in persons:
            persons[f.uid] = f


def fuxx(user, passwd, initialUID, outpath, level, base):
    """Log in, crawl *level* rounds of the friend graph and write the results.

    The crawl starts at *initialUID*, or at the logged-in user when that is
    empty.  For every crawled person ``<uid>.rdf`` is written into *outpath*,
    plus ``<uid>.jpg`` when a photo path was found.  A failure for one user
    is reported and the crawl moves on to the next.
    """
    od = OrkutDriver()
    if verbose:
        print("login")
    if not od.login(user, passwd):
        return

    persons = {}
    if not initialUID:
        persons[od.uid] = OrkutPerson(od.uid, user)
    else:
        persons[initialUID] = OrkutPerson(initialUID, "John Doe")

    for l in range(level):
        if verbose:
            print("start level %d" % l)
        # Snapshot the keys: crawlUser adds entries while we iterate.
        for uid in list(persons):
            p = persons[uid]
            if p.crawled:
                continue
            try:
                crawlUser(od, persons, p.uid, p, outpath)
                photo = p.getPhotoPath()
                if photo:  # a missing photo must not block the RDF output
                    _write_file(os.path.join(outpath, p.uid + ".jpg"), "wb",
                                od.getPhoto(photo))
                _write_file(os.path.join(outpath, p.uid + ".rdf"), "wt",
                            p.printFOAF(persons, base))
            except Exception as e:
                # Best effort by design: report and continue with the rest.
                print(e)
                if verbose:
                    print("crawl user %s failed." % p.uid)

    od.logout()


def cli():
    """Command-line interface (looks at sys.argv to decide what to do)."""
    import getopt

    # -s has to clear the module-level flag read by crawlUser() and fuxx();
    # without this declaration the assignment only creates a local.
    global verbose

    class BadUsage(Exception):
        pass

    initial = ''
    level = 1
    outpath = './'
    base = ''

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'o:l:u:s:r:')

        for opt, val in opts:
            if opt == '-o':
                outpath = val
            elif opt == '-l':
                try:
                    level = int(val)
                except ValueError:
                    raise BadUsage
            elif opt == '-u':
                initial = val
            elif opt == '-s':
                verbose = 0
            elif opt == '-r':
                base = val

        if len(args) < 2:
            raise BadUsage
        user = args[0]
        passwd = args[1]
        if not user or not passwd:
            raise BadUsage

        fuxx(user, passwd, initial, outpath, level, base)

    except (getopt.error, BadUsage):
        # Usage text kept exactly as found in the file.
        print("""orkutfuxx - fuxx orkut and bear FOAF Usage : %s [OPTION] user passwd -o out put directory -l extracting level -u start uid -r site root -s silent """ % sys.argv[0])


if __name__ == '__main__':
    cli()