Mercurial > ~darius > hgwebdir.cgi > scrape-vb
view scrape-vb.py @ 1:8045db05180b SCRAPEVB_1_0
Initial revision
author | darius |
---|---|
date | Sat, 25 Aug 2007 05:17:29 +0000 (2007-08-25) |
parents | |
children | 89232ea0c3d4 |
line wrap: on
line source
#!/usr/bin/env python
############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import re, BeautifulSoup, datetime, time, smtplib, sys, urllib

#### Configuration

### Travel criteria
# Supported keys are email, when, city1, city2, maxcost
# email is mandatory. If city2 is not present either city will be
# matched. when and maxcost are optional (will match for any date or
# cost)
travellers = [
    { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' },
    { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' },
]

### Mail host
mailhost = 'mail.dons.net.au'

### Who the email is from
mailfrom = 'darius@dons.net.au'

### What's on the subject line
mailsubj = 'Virgin Blue Happy Hour Deals'

### Actually send email?
mailsend = False

### URL to parse
vburl = 'http://virginblue.com.au'

### Date format used by the travel period and by the 'when' key
datefmt = "%d/%m/%y"

# Deal titles look like "<city> - <city> $<cost>"
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
# Travel period looks like "Travel Period: d/m/y - d/m/y"
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)


def parsedate(text):
    """Return a datetime (at midnight) for a date string in datefmt.

    Raises ValueError (from time.strptime) if text does not match.
    """
    return datetime.datetime(*time.strptime(text, datefmt)[0:3])


#### Fetch and parse the page
try:
    #f = open("vb-happyhour.html")
    f = urllib.urlopen(vburl)
except IOError as e:
    print("Unable to fetch page - " + str(e))
    sys.exit(1)

s = BeautifulSoup.BeautifulSoup(f)
hrr = s.find("ul", "happyhr-rows")
if hrr is None:
    print("No happy hour details found")
    sys.exit(0)

hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
# The travel period is taken from the first <li> of the 12th <ul> on the
# page.  Every step is checked so that a change in page layout produces
# the diagnostic below rather than a traceback.
ullist = s.findAll('ul')
tptext = None
if len(ullist) > 11:
    tpitem = ullist[11].find('li')
    if tpitem is not None:
        # .string is None when the <li> holds anything but plain text
        tptext = tpitem.string

times = None
if tptext is not None:
    times = parsetper.match(tptext)

if times is None:
    # repr() so that None and non-ASCII text can both be reported safely
    print("Unable to parse travel period " + repr(tptext))
    sys.exit(0)

frtime = parsedate(times.group(1))
totime = parsedate(times.group(2))
#print "Travel from %s to %s" % (str(frtime), str(totime))

#### Match each deal against each traveller
# Maps email address -> list of [city1, city2, cost, url] for matched deals
output = {}
for i in hrlist:
    href = i.find('a')
    if href is None:
        # List item without a link carries no deal information
        continue

    try:
        title = href['title']
        url = href['href']
    except KeyError:
        # Link without a title or target cannot be parsed or reported
        continue

    match = parsetitle.match(title)
    if match is None:
        print("Unable to match " + repr(title))
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))

    for t in travellers:
        if 'email' not in t:
            print("No email key found, configuration error?")
            continue

        # With both cities given the deal must join exactly that pair (in
        # either direction); with only city1 given either end may match.
        citymatch = True
        if 'city1' in t and 'city2' in t:
            if ((t['city1'] != city1 or t['city2'] != city2) and
                (t['city1'] != city2 or t['city2'] != city1)):
                citymatch = False
        elif 'city1' in t:
            if t['city1'] != city1 and t['city1'] != city2:
                citymatch = False

        # The requested date must fall inside the advertised travel period
        datematch = True
        if 'when' in t:
            travtime = parsedate(t['when'])
            if travtime < frtime or travtime > totime:
                datematch = False

        costmatch = True
        if 'maxcost' in t:
            if cost > int(t['maxcost']):
                costmatch = False

        if citymatch and datematch and costmatch:
            if t['email'] not in output:
                output[t['email']] = []
            output[t['email']].append([city1, city2, cost, url])

#### Report (and optionally email) the matches
server = None
if mailsend:
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)

try:
    for o in output:
        msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj))
        msg = msg + "Your criteria for flights have been matched\r\n\r\n"
        print("Sending email to " + o)
        for i in output[o]:
            print("%s <-> %s costs $%d" % (i[0], i[1], i[2]))
            msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])

        msg = msg + "\r\nNote: travel period is from %s to %s" % \
            (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))
        if mailsend:
            server.sendmail(mailfrom, o, msg)
        else:
            print(msg)
            print("")
finally:
    # Close the SMTP session even if sending one of the messages fails
    if server is not None:
        server.quit()