#!/usr/bin/python 

# Based on Blagg: the Blosxom RSS aggregator ( Author: Rael Dornfest <rael@oreilly.com> )

import sys
import string
import os
import stat
import re
import time
import StringIO
import shutil
import md5
import ConfigParser
import email.Utils
import email
import email.Encoders
from email.MIMEBase import MIMEBase        
import rssparser

from xml.dom import *
from xml.dom.minidom import parseString
import anydbm

# --- Configurable variables -----
# Directory holding rss.dat, the cached <nick>.rss files, the dbm databases
# and (when saveItems is set) the "new/" item files.  Most paths are built by
# plain concatenation, so keep the trailing slash.
datadir = "/arch/wlog/"
# Address used in the From / Reply-To / To headers and passed to procmail -f.
myAddress="costin@localhost"

useLocalRss=0  # Read <datadir><nick>.rss instead of fetching, when that file exists
saveRss=1      # Write every non-empty feed that was read to <datadir><nick>.rss
saveItems=0    # Save items to files instead of mailing ( for debug or other purpose )
warnNoDate=0   # Print a warning for items for which no date could be found
debug=0        # Values above 2 make processItem report already-seen items
############################## DOM utils 
def getText(nodelist):
    """Concatenate the character data of the text and CDATA nodes in nodelist.

    Only the members of nodelist itself are examined; elements, comments and
    every other node type are skipped, and nothing is searched recursively.

    Returns None when nodelist is empty or None, otherwise the (possibly
    empty) concatenated string.
    """
    if not nodelist:
        return None
    parts = []
    for node in nodelist:
        # The DOM constant for CDATA sections is CDATA_SECTION_NODE; there is
        # no CDATA_NODE attribute, so testing against it raised AttributeError
        # for every node that was not plain text.
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            parts.append(node.data)
    return "".join(parts)

def getContent(node):
    """Return the text held directly by node's children, as computed by getText()."""
    children = node.childNodes
    return getText(children)

def findChild( node, childName ):
    """Return the first direct child of node whose nodeName equals childName.

    Returns None when node has no children or none of them matches.
    """
    for candidate in node.childNodes or ():
        name = candidate.nodeName
        if name and name == childName:
            return candidate
    return None

def getChildContent(node, childName):
    """Return the text of node's first direct child named childName.

    Returns None when node is empty/None or has no such child; otherwise
    whatever getText() yields for that child's own children.
    """
    if node:
        match = findChild(node, childName)
        if match:
            return getText(match.childNodes)
    return None

#################### Date utils
# Constants
months = ( "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" )

def convertRSSDate( date ):
    """Turn "YYYY-MM-DDThh:mm[:ss][Z]" into "DD Mon YYYY hh:mm[:ss]".

    date -- string of the form <year>-<month>-<day>T<time>.  The month is
            either a number (1-12) or already a name starting with a letter,
            in which case it is passed through unchanged.

    Leading/trailing whitespace is ignored.  Every "Z" is removed from the
    time part; any other zone suffix is left exactly as given.  A value with
    no "T" (date only) gets the time "00:00:00".

    Raises ValueError when the date part does not have three "-" separated
    fields or the month is neither a name nor a number, and IndexError when
    a numeric month is larger than 12.
    """
    date = date.strip()
    if "T" in date:
        (day, clock) = date.split("T", 1)
    else:
        # Date-only values are legal in feeds; give them a midnight time
        # instead of failing on the unpacking.
        (day, clock) = (date, "00:00:00")
    (year, mo, day) = day.split("-")
    clock = clock.replace("Z", "")
    # [A-Za-z] rather than [A-z]: the latter also matches [ \ ] ^ _ and `.
    if not re.match(r"[A-Za-z]", mo):
        mo = months[int(mo) - 1]
    return day + " " + mo + " " + year + " " + clock

#################### Extracting date from content
def getDate( item, content, link ):
    date= getChildContent(item, "dc:date")
    # Special case ( Duncan ): no date, but it's in description
    if not date:
        p=re.search( "Posted: ([0-9]*)/([0-9]*)/([0-9]*) ([0-9]*):([0-9]*)", content)
        if p:
            date=p.group(1) + "-" + p.group(2) + "-" + p.group(3) + "T" + p.group(4) + ":" + p.group(5)
        if date:
            print "Posted " + link + " " + date
    # Ken Coar
    if not date:
        p=re.search( "Updated:[ A-z,]*([0-9]*) ([A-z]*) ([0-9]*) ([0-9]*):([0-9]*)", content)
        if p:
            date=p.group(3) + "-" + p.group(2) + "-" + p.group(1) + "T" + p.group(4) + ":" + p.group(5)
        if date:
            print "Update " + link + " " + date
    if not date:
        date=time.strftime("%Y-%m-%dT%H:%M:%SZ")
        if warnNoDate:
            print "NO DATE ", link, " ", date
    return date

#################### Object model
class channel:
    """One subscribed feed: its nickname, URL and the metadata read from it."""

    def __init__(self):
        self.nick=""
        self.url=""

    def __setstate__(self, dict):
        # Replace the instance attributes wholesale with the given mapping.
        self.__dict__=dict

    def readData( self ):
        """Return the raw RSS text of the channel.

        The text comes from <datadir><nick>.rss when useLocalRss is set and
        that file exists, otherwise it is fetched from self.url through
        rssparser.  After a fetch, self.lastAccess and dbChannelLast[nick]
        are updated with what rssparser.get_modified() reports.  When saveRss
        is set, non-empty data is written back to <datadir><nick>.rss.
        """
        # XXX Timeout, use lastmodified
        cachePath = datadir + self.nick + ".rss"
        if useLocalRss and os.access(cachePath, os.F_OK):
            rssFile = open(cachePath, "r")
        else:
            # Conditional fetching is switched off on purpose: the leading
            # None makes the first branch unreachable, so every run performs
            # the unconditional fetch in the else branch.
            if None and dbChannelLast.has_key( self.nick ):
                print("LAST:  %s" % (dbChannelLast[self.nick],))
                last = dbChannelLast[self.nick].split()
                rssFile = rssparser.open_resource(self.url, modified= last )
                self.lastAccess = rssparser.get_modified( rssFile )
                print("LastModified  %s   %s   %s" % (self.url, self.lastAccess, dbChannelLast[self.nick]))
            else:
                rssFile = rssparser.open_resource(self.url)
                self.lastAccess = rssparser.get_modified( rssFile )
                print("First access  %s   %s" % (self.url, self.lastAccess))

            dbChannelLast[self.nick] = str( self.lastAccess )

        # Close the source even when reading fails.
        try:
            rssData = rssFile.read()
        finally:
            rssFile.close()

        if rssData and saveRss:
            out = open(cachePath, "w")
            try:
                out.write( rssData )
            finally:
                out.close()
        return rssData

    def processChannelNode( self, channelNode ):
        """Fill in title, link and description from the feed's channel node.

        A missing node, or a node without title/link, falls back to the
        nick and URL so that title and link are never left as None.
        description is None when it is not available.
        """
        if channelNode:
            self.title = getChildContent( channelNode, "title") or self.nick
            self.link = getChildContent( channelNode, "link") or self.url
            self.description = getChildContent( channelNode, "description")
        else:
            self.title = self.nick
            self.link = self.url
            self.description = None
            print("No channel node")


class item:
    """A single feed entry; starts with empty link, date and md5 fields."""

    def __init__(self):
        self.link, self.date, self.md5 = "", "", ""

    def __setstate__(self, dict):
        # The supplied mapping becomes the complete attribute set.
        self.__dict__ = dict

def loadUrl(source):
    """Open source as a URL and return a file-like object for its body.

    Returns the urllib2 response on success, an empty StringIO when the
    server answers with an HTTP error status, and None for any other
    failure (for instance when source is not a valid URL).

    No conditional-request headers (If-None-Match / If-Modified-Since) are
    sent.
    """
    import urllib2
    # No "Accept-encoding: gzip" header: nothing here decompresses the reply
    # and urllib2 does not do it either, so a gzip answer would reach the
    # caller as raw compressed bytes.
    try:
        request = urllib2.Request(source)
        return urllib2.urlopen(request)
    except urllib2.HTTPError:
        # either the resource is not modified or some other HTTP
        # error occurred so return an empty resource
        return StringIO.StringIO("")
    except Exception:
        # source must not be a valid URL but it might be a valid filename.
        # "except Exception" keeps the best-effort behaviour while letting
        # KeyboardInterrupt / SystemExit through.
        return None


def updateChannelList(fileName ):
    """
    Read the channel list from fileName and return a dict nick -> channel.

    Each line has the form "nick url [mode]", fields separated by
    whitespace.  mode is a comma separated option list stored as
    ch.options (empty list when omitted).  "#" starts a comment that runs
    to the end of the line; blank lines are ignored.  Lines with any other
    number of fields are reported and skipped instead of aborting the run.
    Every channel starts with an empty items dict.
    """
    fh=open(fileName)
    try:
        lines=fh.readlines()
    finally:
        fh.close()

    channels=dict()
    for line in lines:
        # Cut trailing comment, then split on whitespace
        fields = re.sub( r"#.*$", "", line).split()
        if not fields:
            continue
        if len(fields) not in (2, 3):
            print("Skipping malformed channel line: " + line.strip())
            continue

        ch=channel()
        ch.nick=fields[0]
        ch.url=fields[1]
        ch.items=dict()
        if len(fields) == 3:
            ch.options=fields[2].split(",")
        else:
            ch.options=[]
        channels[ ch.nick ]=ch
    return channels

def extractContent( itemN ):
    """Return the body of an item node.

    content:encoded wins when it is present and non-empty, then
    description, and finally the literal "No content".
    """
    summary = getChildContent(itemN, "description")
    body = getChildContent(itemN, "content:encoded")
    return body or summary or "No content"

def getImages( content ):
    """ Intended to find all images in the content and return a list of
    tuples ( fileName, type, content ).  Not implemented: the result is
    always a new empty list. """
    # XXX not ready.  This will need a regexp - using DOM is unlikely to
    # work, the content may be bad html ( the rss parser is fine since it
    # is encoded )
    return []


def sendMail( ch, it, content):
    """Wrap one feed item in a MIME message and deliver (or save) it.

    ch      -- channel the item belongs to; title and nick are read
    it      -- item; title, link, date and md5 are read
    content -- HTML body of the item

    The message is multipart/mixed with a single text/html part: a link to
    the item followed by content.  When saveItems is set the message is
    written to a file under <datadir>new/, otherwise it is piped to
    procmail.  A non-zero procmail exit status is reported on stdout.
    """
    # Header values must stay on one line, so newlines are turned into
    # spaces in both titles.  A channel without a title uses its nick.
    subjectLine= re.sub( "\n", " ", it.title.encode("UTF8") )
    channelTitle= re.sub( "\n", " ", ch.title or ch.nick )

    # Placeholder for inline images; the result is not used yet.
    images=getImages(content)

    msg=MIMEBase("multipart","mixed")
    msg["Subject"]=subjectLine
    # should be blogg+ENCODED_LINK@DOMAIN -> for reply
    msg["From"]= channelTitle + " <" + myAddress + "> (" + "\"blog-" + it.link + "\")"
    msg["Reply-To"]="\"blog-" + it.link + "\" <" + myAddress + "> (" + channelTitle + ")"
    msg["Date"]=convertRSSDate(it.date)
    msg["X-MD5"]=it.md5
    # NOTE(review): the bare link is used as message id (no <...> around it).
    msg["Message-ID"]=it.link
    msg["References"]=it.link
    msg["To"]="blog-in <" + myAddress + ">"

    contentBody=MIMEBase("text", "html", charset="UTF8")
    body="<a href='" + it.link + "'>" + it.title + "</a><br/><br/>" + content
    # set_payload()/attach() replace the deprecated add_payload().
    contentBody.set_payload( body.encode("UTF8") )
    msg.attach( contentBody )

    data=msg.as_string()

    if saveItems:
        fileName= re.sub( r"\W", "_", it.title )
        fileName = datadir + "new/" + ch.nick + "." + fileName[0:15] + "-" + it.date  + ".txt"
        articleFile=open(fileName, "wb")
        try:
            articleFile.write( data )
        finally:
            articleFile.close()
    else:
        mail=os.popen( "/usr/bin/procmail -f " + myAddress, "w")
        try:
            mail.write( data)
        finally:
            # Closing sends EOF to procmail and yields its exit status
            # (None when it succeeded).
            status=mail.close()
        if status:
            print("procmail exited with status " + str(status))


def processItem( ch, itemN ):
    link=getChildContent(itemN, "link") 
    if not link:
        print "No link ", itemN.childNodes
        link="No link"

#    print "Loading ", link
#    cStream = loadUrl( link )
#    if cStream:
#        content = cStream.read()
#    else:
    content = extractContent( itemN )

    date=getDate( itemN, content, link )
    dbArticleLast[ link ]=date

    # Skip already seen article.
    # XXX Remove from our cache articles that aren't in the feed
    # Use MD5 to detect updates
    contentMD5=md5.new( content.encode("UTF8")).hexdigest()

    # We use the link as an ID ( it's "permalink", isn't it ?)
    if dbArticleMD5.has_key( link ):
        if  dbArticleMD5[link] == contentMD5:
            if debug>2: print "Already seen ", link
            return
        else:
            print "Modified ", link

    # New item
    it=item()
    it.channel=ch
    it.link=link
    it.date=date
    it.inFeed=1
    it.md5=contentMD5
    dbArticleMD5[link]=contentMD5
    ch.items[link]=it
    dbArticleChannel[link]=ch.nick

    it.title= getChildContent(itemN, "title")
    if not it.title:
        print "No title ", itemN
        it.title="No title"
    it.subject= getChildContent(itemN, "dc:subject")

    print "Processing ", ch.nick, " ", it.link, " ", it.date
    # print content
    sendMail(ch, it, content )

def processChannels( channels ):
    for ch in channels.itervalues():
        try:
            ## Load the .rss for the channel
            rssData=ch.readData()
            
            if not rssData or rssData=="":
                print "No data ",ch.url
                continue
            try:
                rssN = parseString( rssData )
            except Exception, detail:
                print "Error processing rss", Exception, " ", detail, " ", ch.url, "\n"
                continue
                

            # Mark all items as "not in feed". When an item is found, it is unmarked
            #if not hasattr(ch, "items"): ch.items=dict()
            #for it in ch.items.itervalues():
            #    it.inFeed=0

            channelNode = rssN.getElementsByTagName("channel" )[0]

            ch.processChannelNode( channelNode )

            for itemN in rssN.getElementsByTagName( "item" ):
                processItem( ch, itemN )
                
            rssN.unlink()

            # Remove from cache items that are no longer in the feed
            #old=list()
            #for it in ch.items.itervalues():
            #    if not hasattr(it, "inFeed") or not it.inFeed:
            #        old.append(it)
            #for it in old:
            #    print "Removing old ", it.link
            #    del ch.items[ it.link ]

            print "Processed " + ch.nick + " --> " + ch.url
            # Dump info after each iteration
        except IOError, detail:
            print "Error processing ", IOError, " ", detail, " ", ch.url, "\n" 
#        except Exception, detail:
#           print "Error processing ", Exception, " ", detail, " ", ch.url, "\n" 

#################### Main
if __name__ == '__main__':

    # These four databases are module-level globals used by channel.readData()
    # and processItem().
    dbArticleMD5 = anydbm.open(datadir + '/article-md5.db', 'c')   # ArticleID -> MD5
    dbChannelLast = anydbm.open(datadir + '/channel-last.db', 'c') # channel -> last access time

    # Written on every run but not read anywhere yet - will be used to expire the old IDs
    dbArticleLast = anydbm.open(datadir + '/article-last.db', 'c') # Article ->last
    dbArticleChannel = anydbm.open(datadir + '/article-channel.db', 'c') # Article->channel

    try:
        channels=updateChannelList(datadir + "rss.dat")
        processChannels( channels )
    finally:
        # Close the databases even when processing fails, so that what was
        # recorded up to that point is not lost.
        dbArticleMD5.close()
        dbChannelLast.close()
        dbArticleLast.close()
        dbArticleChannel.close()
