Mercurial > ~darius > hgwebdir.cgi > scrape-vb
diff scrape-vb.py @ 1:8045db05180b SCRAPEVB_1_0
Initial revision
author | darius |
---|---|
date | Sat, 25 Aug 2007 05:17:29 +0000 (2007-08-25) |
parents | |
children | 89232ea0c3d4 |
line wrap: on
line diff
#!/usr/bin/env python

############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

# NOTE: the syntax below is kept valid for both Python 2.6+ and Python 3
# (single-argument print(...), "except ... as e"), but main() still calls
# urllib.urlopen, which only exists on Python 2.

import datetime
import re
import smtplib
import sys
import time
import urllib

#### Configuration

### Travel criteria
# Supported keys are email, when, city1, city2, maxcost
# email is mandatory. If city2 is not present either city will be
# matched. when and maxcost are optional (will match for any date or
# cost). when is a dd/mm/yy date (it is parsed with "%d/%m/%y").
travellers = [
    { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' },
    { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' },
    ]

### Mail host
mailhost = 'mail.dons.net.au'

### Who the email is from
mailfrom = 'darius@dons.net.au'

### What's on the subject line
mailsubj = 'Virgin Blue Happy Hour Deals'

### Actually send email?
mailsend = False

### URL to parse
vburl = 'http://virginblue.com.au'

# Raw strings so that "\$" reaches the regex engine as written.
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)


def parse_date(text):
    """Return the datetime (at midnight) for a dd/mm/yy string.

    Raises ValueError if text does not match the "%d/%m/%y" format.
    """
    return datetime.datetime(*time.strptime(text, "%d/%m/%y")[0:3])


def parse_title(title):
    """Split a deal title into (city1, city2, cost).

    title is expected to look like "<city> - <city> $<cost>". Returns None
    when title is empty/None or does not match parsetitle.
    """
    match = None
    if title:
        match = parsetitle.match(title)
    if match is None:
        return None
    # strip() guards against a padding space being captured with the name,
    # which would defeat the exact string comparison against the criteria.
    return match.group(1).strip(), match.group(2).strip(), int(match.group(3))


def traveller_matches(traveller, city1, city2, cost, frtime, totime):
    """Return True if a deal satisfies every criterion the traveller set.

    traveller is one entry of the travellers list. city1/city2/cost describe
    the deal; frtime/totime are the datetimes bounding the travel period
    (both ends inclusive). Criteria that are absent always match. City
    names are compared exactly (case-sensitively), in either direction.
    """
    if 'city1' in traveller and 'city2' in traveller:
        wanted = (traveller['city1'], traveller['city2'])
        if wanted != (city1, city2) and wanted != (city2, city1):
            return False
    elif 'city1' in traveller:
        if traveller['city1'] not in (city1, city2):
            return False

    if 'when' in traveller:
        travtime = parse_date(traveller['when'])
        if travtime < frtime or travtime > totime:
            return False

    if 'maxcost' in traveller and cost > int(traveller['maxcost']):
        return False

    return True


def find_matches(deals, people, frtime, totime):
    """Group the deals that match each traveller by email address.

    deals is an iterable of (city1, city2, cost, url) tuples and people is a
    list shaped like travellers. Returns a dict mapping each email address
    to the list of deal tuples matching that traveller; travellers with no
    matching deal do not appear in the result.
    """
    # Check the configuration once rather than once per deal.
    valid = []
    for person in people:
        if 'email' not in person:
            print("No email key found, configuration error?")
        else:
            valid.append(person)

    output = {}
    for deal in deals:
        city1, city2, cost = deal[0], deal[1], deal[2]
        for person in valid:
            if traveller_matches(person, city1, city2, cost, frtime, totime):
                output.setdefault(person['email'], []).append(deal)
    return output


def format_day(when):
    """Format a datetime like "Saturday 25 August 2007".

    The day number is formatted separately because the "%e" strftime
    directive is not available on every platform.
    """
    return "%s %d %s" % (when.strftime("%A"), when.day, when.strftime("%B %Y"))


def build_message(recipient, deals, frtime, totime):
    """Return the complete mail text (headers and body) for one recipient.

    deals is the list of (city1, city2, cost, url) tuples to report.
    """
    parts = [
        "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, recipient, mailsubj),
        "Your criteria for flights have been matched\r\n\r\n",
    ]
    for deal in deals:
        parts.append("%s <-> %s costs $%d - %s\r\n" % tuple(deal))
    parts.append("\r\nNote: travel period is from %s to %s" %
                 (format_day(frtime), format_day(totime)))
    return "".join(parts)


def extract_travel_period(soup):
    """Return the travel period text from the parsed page, or None.

    XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
    doesn't work, so the list is located by its position on the page.
    """
    lists = soup.findAll('ul')
    if len(lists) <= 11:
        return None
    item = lists[11].find('li')
    if item is None:
        return None
    return item.string


def main():
    """Fetch the page, match the deals against travellers and report them.

    Exits with status 1 if the page cannot be fetched, and with status 0
    if the page holds no happy hour list or no parsable travel period.
    """
    # Imported here so the matching helpers above can be used without the
    # third-party BeautifulSoup module being installed.
    import BeautifulSoup

    try:
        page = urllib.urlopen(vburl)
    except IOError as e:
        print("Unable to fetch page - " + str(e))
        sys.exit(1)

    try:
        s = BeautifulSoup.BeautifulSoup(page)
    finally:
        page.close()

    hrr = s.find("ul", "happyhr-rows")
    if hrr is None:
        print("No happy hour details found")
        sys.exit(0)

    period = extract_travel_period(s)
    times = None
    if period:
        times = parsetper.match(period)
    if times is None:
        print("Unable to parse travel period %s" % period)
        sys.exit(0)

    frtime = parse_date(times.group(1))
    totime = parse_date(times.group(2))

    deals = []
    for item in hrr.findAll("li"):
        href = item.find('a')
        title = None
        if href is not None:
            title = href.get('title')
        parsed = parse_title(title)
        if parsed is None:
            print("Unable to match %s" % title)
            continue
        deals.append(parsed + (href.get('href', ''),))

    output = find_matches(deals, travellers, frtime, totime)

    # Only connect when there is something to send, and always hang up.
    server = None
    if mailsend and output:
        server = smtplib.SMTP(mailhost)
        #server.set_debuglevel(1)

    try:
        for o in output:
            print("Sending email to " + o)
            for deal in output[o]:
                print("%s <-> %s costs $%d" % (deal[0], deal[1], deal[2]))

            msg = build_message(o, output[o], frtime, totime)
            if server is not None:
                server.sendmail(mailfrom, o, msg)
            else:
                print(msg)
                print("")
    finally:
        if server is not None:
            server.quit()


if __name__ == "__main__":
    main()