Mercurial > ~darius > hgwebdir.cgi > scrape-vb
view scrape-vb.py @ 9:3e03facad74b default tip
New example files for latest layout.
author | darius |
---|---|
date | Thu, 18 Oct 2007 06:58:00 +0000 |
parents | d17fd6f3a492 |
children |
line wrap: on
line source
#!/usr/bin/env python

############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# ############################################################################ import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib import ConfigParser, optparse, SMSVodaAu usage = '''%prog [options] Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini''' optparse = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $") optparse.add_option('-d', '--debug', action="store_true", default=False, help="Disable mail & SMS sending, prints message to stdout") optparse.add_option('-f', '--file', help="Do not fetch the page, use this file instead") optparse.add_option('-e', '--example', action="store_true", default=False, help="Print an example configuration file to stdout and exit") (options, args) = optparse.parse_args() if (options.example): print '''[global] mailsubj="Subject line for emails" # The following 3 options are necessary before email will be sent mailfrom=user@host.com mailsend=True mailhost=mail.server.com smsuser=0412312312 smspass=mys3krit smssend=True [user@host.com] # All fields are optional city1=Foo city2=Bar when=dd/mm/yy maxcost=123 phone=0498765432 ''' sys.exit(0) parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) conf = ConfigParser.ConfigParser() conf.add_section('global') conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals') conf.set('global', 'vburl', 'http://virginblue.com.au') conflist = ['scrape-vb.ini'] if ('HOME' in os.environ): conflist.append(os.path.expanduser('~/.scrape-vb.ini')) conf.read(conflist) try: if (options.file != None): f = open(options.file) else: f = urllib.urlopen(conf.get('global', 'vburl')) except IOError, e: print "Unable to fetch page - " + str(e) sys.exit(1) # Test if we have been configured to send SMSs try: smsuser = conf.get('global', 'smsuser') smspass = conf.get('global', 'smspass') smssend = conf.getboolean('global', 'smssend') except ConfigParser.NoOptionError: smssend = False if 
(options.debug == True and smssend): print "smssend overridden due to debugging" smssend = False if (smssend): smshndl = SMSVodaAu.SMSVodaAu(smsuser, smspass) s = BeautifulSoup.BeautifulSoup(f) citypairs = s.findAll("td", "city-pair") if (citypairs == []): print "No happy hour details found" sys.exit(0) prices = s.findAll("td", "dash-r price") if (prices == []): print "Couldn't find prices" sys.exit(0) if (len(citypairs) != len(prices)): print "City pair & price tables don't have equal size" sys.exit(0) times = parsetper.search(s.find('p', 'tandc').string) if (times == None): print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li')) sys.exit(0) frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3]) # # Go through the HTML and work out who wants to be notified of what # # Store in output, a dictionary keyed by email adddress which holds a # list of each matching flight (city1, city2, cost, url) # output = {} for i, p in zip(citypairs, prices): href = i.find('a') city1 = href.next.strip() city2 = href.next.next.next.next.next.strip() cost = int(p.find('a').string.strip('$^ ')) url = href['href'] for email in conf.sections(): if (email == 'global'): continue # Stuff configuration into a dictionary for our convenience t = {'email' : email} for i in conf.items(email): t[i[0]] = i[1] citymatch = True if ('city1' in t and 'city2' in t): if((t['city1'] != city1 or t['city2'] != city2) and (t['city1'] != city2 or t['city2'] != city1)): citymatch = False elif ('city1' in t): if (t['city1'] != city1 and t['city1'] != city2): citymatch = False datematch = True if ('when' in t): travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3]) if (travtime < frtime or travtime > totime): datematch = False costmatch = True if ('maxcost' in t): if (cost > int(t['maxcost'])): costmatch = False if (citymatch and datematch and costmatch): if 
(t['email'] not in output): output[t['email']] = [] output[t['email']].append([city1, city2, cost, url]) # Test if we have been configured to send email try: mailsubj = conf.get('global', 'mailsubj') mailhost = conf.get('global', 'mailhost') mailsend = conf.getboolean('global', 'mailsend') mailfrom = conf.get('global', 'mailfrom') except ConfigParser.NoOptionError: mailsend = False if (options.debug == True and mailsend): print "mailsend overridden due to debugging" mailsend = False if (mailsend): server = smtplib.SMTP(mailhost) #server.set_debuglevel(1) # # Output the various notifications # ttimestr = "Note: travel period is from %s to %s" % \ (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")) # Email each person about their flights if (mailsend): for o in output: msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj) msg = msg + "Your criteria for flights have been matched\r\n\r\n" for i in output[o]: msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]) msg = msg + "\r\n" + ttimestr + "\r\n" server.sendmail(mailfrom, o, msg) else: # If not emailing print to stdout for o in output: print "Match for " + o for i in output[o]: print "%s <-> %s costs $%d" % (i[0], i[1], i[2]) # SMS each person about their flights if (smssend): for o in output: if (conf.has_option(o, 'phone')): pnum = conf.get(o, 'phone') msg = "" for i in output[o]: msg = msg + "%s <-> %s $%d, " % (i[0], i[1], i[2]) # Chop off the last , & make sure the whole message is not # too large. msgend = min(len(msg) - 2, 160) print msg[0:msgend] try: smshndl.sendamsg(pnum, msg[0:msgend]) print "Sent SMS to " + pnum except: print "Unable to send SMS to " + pnum