#!/usr/local/bin/python
#
#  Yahoo Group (NEO) message retrieval script
#
#  Usage:  python yg.py [user=pass] GroupName [StartMess [EndMess]]
#
#  Specify GroupName (and, optionally, the starting and ending message numbers to retrieve)
#
#     Use your Yahoo username and password if the group is restricted
#  
#
#  This can be easily adapted to convert messages to forum posts, mbox format, whatever.
#  
#  Note: This script stops after 20 consecutive skipped (deleted, etc) messages. You can modify this number.
#
#
# written by nkom@rocketmail.com
#

import sys, os, urllib2, re, time, urllib
from cookielib import LWPCookieJar

### Configuration ###

DBG = 0                         # Set this to 1 if you want to see headers, cookies, and raw file, etc

user="YahooLogin"               # You need to set username and password for restricted group, here,
passwd="YahooPass"              # or supply at run time

sleep = 3                       # How many seconds to sleep after fetching one message
retry = 3                       # retry counts for bad connection
skip = 20                       # number of consecutive skipped messages before to stop operation
messages = 10000                # How many messages to retrieve if end message arg isn't supplied

head = """<html><body>"""       # HTML header
foot = """</body></html>"""     # footer

# Optional local overrides: if a module named ygr_conf is importable, any of
# the settings above that it defines replace the defaults.  A plain
# "import ygr_conf" alone would load the module but never read its values.
try:
  import ygr_conf
except ImportError:
  pass                          # no local configuration; keep the defaults above
else:
  for _name in ("DBG", "user", "passwd", "sleep", "retry", "skip",
                "messages", "head", "foot"):
    if hasattr(ygr_conf, _name):
      globals()[_name] = getattr(ygr_conf, _name)

# Cookie jar shared by every request made through the opener below.
cj = LWPCookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

### You may need to adjust these, in some cases ###
# Request headers sent with every request.  The first header name used to be
# misspelled 'ser-Agent', so the browser identification was never sent as a
# User-Agent header.
opener.addheaders = [
  ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'),
  ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
  ('Accept-Language', 'en-us'),
  ('Connection', 'keep-alive'),
]
uopen = opener.open             # shorthand used for every HTTP request in this script

if DBG: print sys.argv[1]

if len(sys.argv) > 1 and "=" in sys.argv[1]:	# username=password combo is supplied
  i = sys.argv[1].find("=")
  user = sys.argv[1][:i]
  passwd = sys.argv[1][i+1:]
  sys.argv.pop(1)

if len(sys.argv) > 1:				# Setting group name
  g= sys.argv[1]
else:
  print "Enter the name of Yahoo group to retrieve messages, please!"
  sys.exit(2)

try:
  os.mkdir(g)
except:
  pass

### Logging ###
# Per-group log file, opened in append mode so earlier runs are kept.
log = open(g + "_yg-log.txt", "a")

def lw(s):
  """Write the string s to the log file followed by a CR LF line ending."""
  line = s + "\r\n"
  log.write(line)

# Each run starts its log section with the current local time.
stamp = time.strftime("%Y-%m-%d %H:%M:%S")
lw(stamp)

if len(sys.argv) > 2:                           # Set starting message number
  i = int(sys.argv[2])
else:
  # Not supplied: resume from the number stored in the "<group>_last" file
  # minus 3.  If that file is missing or does not hold a number, start at 1.
  try:
    last_file = open(g + "_last")
    try:
      i = int(last_file.read()) - 3
    finally:
      last_file.close()
  except (IOError, ValueError):
    i = 1
  if i < 1: i = 1                               # "last - 3" must not go below the first message

if len(sys.argv) > 3:                           # last message to retrieve
  n = int(sys.argv[3])
else:
  n = i + messages                              # if not supplied, get default number of messages
if i > n: n = i                                 # never let the end precede the start

lw("Starting message = %d, retrieving %d messages" %(i, n - i))

# u is the base URL a message number is appended to; mlink is the matching
# href prefix used when links inside saved pages are rewritten.
u="http://groups.yahoo.com/neo/groups/%s/conversations/messages/" % (g,)
mlink = 'href="http://groups.yahoo.com/neo/groups/%s/conversations/messages/' % (g,)

### Prepare Cookie ###
cjf = g + '_cookie.txt'
try:
  cj.load(cjf)
except:
  print "Failed to load cookies"


###  Main loop ###
xcount = 0
start = i
mdir = g
rcount = 0

while i <= n:
  try:
    uh = uopen(u + str(i))		# connect to the server
  except:
    print "Failed to connect", rcount   # if it failed to connect:
    if rcount-2 > retry:		#   abort if retry count exceeds the limit
      print "Aborting"
      lw("Failed to connect. Aborting at %d" % (i,))
      break
    rcount = rcount + 1
    continue				#   otherwise retry

  rcount = 0
  s = uh.read()				# get the HTML data

  if i > 9999:				# when message num is over 9999, put files in different directory
    si = str(i)
    si = si[:len(si)-4]
    mdir = "%s-%s" % (g, si)		# it will be groupname-1  for 10000 to 19999, for example.
    if not os.path.exists(mdir): os.mkdir(mdir)
  
  if DBG: file(mdir + os.path.sep + str(i) + "-raw.html","w").write(head + s + foot)

  ic = s.find('<div class="msg-content"></div>')	# find the message content section
  if ic > 0 and i == start:                         	# If the message is empty, login is required
    m = "Probably member login is required"
    lw(m)
    print m

    if DBG:  
      for hd in uh.headers.headers:
        print hd,
      for c in cj: 
        print c
      print "======="

    iul = s.find("GROUPS.LOGIN_URL = ") + 20		# find the url for login screen
    iul2 = s.find('"', iul + 5)
    ul = s[iul:iul2]
    print ul
    suh = uopen(ul)
    sul = suh.read()

    if DBG: 
      for hd in suh.headers.headers:
        print hd,
      for c in cj: 
        print c
      print "======="
      file(mdir + os.path.sep + str(i) + "-ul.html","w").write(sul)

    iul = sul.find("action=") + 8			# find url to submit login data
    iul2 = sul.find('"', iul + 5)
    ul = sul[iul:iul2]
    print ul  
    iul = sul.find('</form>', iul2 + 5) 		# determin the section where post form data is
    if iul < 0:
      iul = sul.find('''<div id='inputs'>''', iul2 + 5)	# If we cannot find it, try another way
    sul = sul[iul2:iul]
    data = {"login":user,"passwd":passwd}		# We do supply username and password 
    iul2 = 0
    while 1:						# Loop for gathering post form data
      iul = sul.find("name=", iul2)			# find post data name
      if iul < 0: break					# exit the loop when there is no more name/value pair
      iul = iul + 6
      iul2 = sul.find('"', iul)
      name = sul[iul:iul2]
      if name == "login" or name == "id" or name == "password":	# skip username and password
        continue
      
      iul = sul.find("value=", iul2)			# find post data value
      if iul < 0: 
        v = ""
      else:
        iul = iul + 7
        iul2 = sul.find('"', iul)
        v = sul[iul:iul2]
      data[name] = v					# store the name and value in a dict
    if DBG: 
      print "===="
      print data

    suh2 = uopen(ul, urllib.urlencode(data))		# post everything
    sul = suh2.read()

    if DBG: 
      for hd in suh2.headers.headers:
        print hd,
      for c in cj: 
        print c
      print "======="
      file(mdir + os.path.sep + str(i) + "-id.html","w").write(sul)

    ic = sul.find('<div class="msg-content"></div>')	# check if message content exists
    if ic > 0:
      print "Trying", u + str(i)
      s = uopen(u + str(i)).read()			# if no message content, try message url
      ic = s.find('<div class="msg-content"></div>')
      if ic > 0:					# if we don't have it, probably we failed to login...
        print "Login failed"
        lw("Login failed")
        break
    elif sul.find('''<meta http-equiv="Refresh"''') > 0: # good sign, meta redirect for the message
      print "Login successful? Redirect to message URL"
      s = uopen(u + str(i)).read()			# connect and get the data
      if DBG: file(mdir + os.path.sep + str(i) + "-rd-raw.html","w").write(s)
    else:
      s = sul

  ### Message data handling ###
  try:  
    ia = s.index('<div class="msg-title">')		# find title
    #ia = s.index('</div>',ia)
  except:
    ia = 0

  try:
    ib = s.index('<div class="card-action-bar">')	
  except:
    ib = 0

  if ic > 0  or ia + ib == 0 or len(s) == 0:		# no title, no action bar ==> probably deleted message
    print str(i)+"x", xcount,
    sys.stdout.flush()
    lw(str(i) + " skipped")
    i = i + 1
    xcount = xcount + 1
    if xcount > skip: break				# if we skipped 20 messages, maybe we are at the end
    continue

  xcount = 0						# reset skip count if we found the mesage title

  s = s[ia:ib]						# Cut out all Yahoo junk
  s = re.sub("/neo/groups/%s/conversations/messages/(\d+)" % (g,), "\\1.html", s) # modify "next" and "previous" link
  s = s.replace('href="javascript:;',mlink + str(i))				  # modify "view source" url
  s = s.replace('>View Source</span>', '><a ' + mlink + str(i) + '">View Source</a></span>')

  file(mdir + os.path.sep + str(i) + ".html","w").write(head + s + foot)	# save the file
  #print s[ia:ib]
  print i,
  sys.stdout.flush()
  log.flush()

  if s.find("""<i class="yg-sprite tip  rt-dactv" alt="Next" pos="bottom">Next</i></span>""") > 0: 
    print "End of messages"					#
    break

  ia = s.find("<a href=")
  if ia > 0:
    ia = ia + 9
    ib = s.find('.', ia)
    n = int(s[ia:ib])
    if n-i > 1: print "(%d)" % (n-i),
    i = n
  else:
    i = i + 1

  time.sleep(sleep)						# sleep 1 second not to stress Yahoo server
  ### End of the main loop ###

if xcount > (skip -1): i = i - xcount
file(g + "_last","w").write(str(i))			# remember the last message so that we can restart from there

try:
  cj.save(cjf)						# save cookie
except:
  print "Cookies could not be saved"
lw("All done")
log.close()

