#!/usr/local/bin/python # # Yahoo Group (NEO) message retrieval script # # Usage: python yg.py [user=pass] GroupName [StartMess [EndMess]] # # Specify GroupName (and optional starting message number and ending message number to retrive) # # Use your Yahoo username and password if the group is restricted # # # This can be easily adopted to convert messages to forum posts, mbox format, whatever. # # Note: This script stops after 20 consecutive skipped (deleted, etc) messages. You can modify this number. # # # written by nkom@rocketmail.com # import sys, os, urllib2, re, time, urllib from cookielib import LWPCookieJar ### Configuration ### DBG = 0 # Set this to 1 if you want to see headers, cookies, and raw file, etc user="YahooLogin" # You need to set username and password for restricted group, here, passwd="YahooPass" # or supply at run time sleep = 3 # How many seconds to sleep after fetching one message retry = 3 # retry counts for bad connection skip = 20 # number of consecutive skipped messages before to stop operation messages = 10000 # How many mesages to retrieve if end message arg isn't supplied head = """
""" # HTML header foot = """""" # footer try: import ygr_conf except: pass cj = LWPCookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) ### You may need to adjust these, in some cases ### opener.addheaders = [('ser-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Language', 'en-us'), ('Connection', 'keep-alive')] uopen = opener.open if DBG: print sys.argv[1] if len(sys.argv) > 1 and "=" in sys.argv[1]: # username=password combo is supplied i = sys.argv[1].find("=") user = sys.argv[1][:i] passwd = sys.argv[1][i+1:] sys.argv.pop(1) if len(sys.argv) > 1: # Setting group name g= sys.argv[1] else: print "Enter the name of Yahoo group to retrieve messages, please!" sys.exit(2) try: os.mkdir(g) except: pass ### Logging ### log = open(g + "_yg-log.txt","a") def lw(s): log.write(s + "\r\n") lw(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) if len(sys.argv) > 2: # Set starting message number i= int(sys.argv[2]) else: try: i = int(open(g + "_last").read()) -3 # If it's not supplied, try to use the last successful message - 3 except: i= 1 if len(sys.argv) > 3: # last message to retrieve n= int(sys.argv[3]) else: n= i + messages # if not supplied, get default number of messages if i > n: n = i lw("Starting message = %d, retrieving %d messages" %(i, n - i)) u="http://groups.yahoo.com/neo/groups/%s/conversations/messages/" % (g,) mlink = 'href="http://groups.yahoo.com/neo/groups/%s/conversations/messages/' % (g,) ### Prepare Cookie ### cjf = g + '_cookie.txt' try: cj.load(cjf) except: print "Failed to load cookies" ### Main loop ### xcount = 0 start = i mdir = g rcount = 0 while i <= n: try: uh = uopen(u + str(i)) # connect to the server except: print "Failed to connect", rcount # if it failed to connect: if rcount-2 > retry: # abort if retry count exceeds the limit print "Aborting" lw("Failed to connect. Aborting at %d" % (i,)) break rcount = rcount + 1 continue # otherwise retry rcount = 0 s = uh.read() # get the HTML data if i > 9999: # when message num is over 9999, put files in different directory si = str(i) si = si[:len(si)-4] mdir = "%s-%s" % (g, si) # it will be groupname-1 for 10000 to 19999, for example. if not os.path.exists(mdir): os.mkdir(mdir) if DBG: file(mdir + os.path.sep + str(i) + "-raw.html","w").write(head + s + foot) ic = s.find('') # find the message content section if ic > 0 and i == start: # If the message is empty, login is required m = "Probably member login is required" lw(m) print m if DBG: for hd in uh.headers.headers: print hd, for c in cj: print c print "=======" iul = s.find("GROUPS.LOGIN_URL = ") + 20 # find the url for login screen iul2 = s.find('"', iul + 5) ul = s[iul:iul2] print ul suh = uopen(ul) sul = suh.read() if DBG: for hd in suh.headers.headers: print hd, for c in cj: print c print "=======" file(mdir + os.path.sep + str(i) + "-ul.html","w").write(sul) iul = sul.find("action=") + 8 # find url to submit login data iul2 = sul.find('"', iul + 5) ul = sul[iul:iul2] print ul iul = sul.find('', iul2 + 5) # determin the section where post form data is if iul < 0: iul = sul.find('''