Mercurial > ~darius > hgwebdir.cgi > scrape-vb
view scrape-vb.py @ 4:e3f4ef0b6e39
Oops, read URL from configuration like I planned.
author | darius |
---|---|
date | Mon, 27 Aug 2007 02:29:27 +0000 |
parents | 89232ea0c3d4 |
children | 275603a8e2ae |
line wrap: on
line source
#!/usr/bin/env python
############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.3 2007/08/27 02:29:27 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import re, BeautifulSoup, datetime, time, smtplib, sys, urllib, ConfigParser

# NOTE: this is a Python 2 script.  print is always given one parenthesised
# string so that each call prints exactly what the statement form would.

# Matches a deal link title of the form "<city> - <city> $<whole dollars>";
# groups 1 and 2 are the two cities, group 3 is the fare.
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)

# Matches "Travel Period: d/m/y - d/m/y"; groups 1 and 2 are the two dates.
parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - '
                       '([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)

# Date format used both on the page and for the "when" option in the
# configuration file.
DATEFMT = "%d/%m/%y"

# Position, among all <ul> elements on the page, of the list whose first
# <li> holds the travel period.
# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
TPER_UL_INDEX = 11


def _parsedate(text):
    """Return *text* (in DATEFMT) as a datetime at midnight.

    Raises ValueError if *text* is not in DATEFMT.
    """
    return datetime.datetime(*time.strptime(text, DATEFMT)[0:3])


def _criteria_match(crit, city1, city2, cost, frtime, totime):
    """Return True if one deal satisfies one subscriber's criteria.

    crit is a dict of that subscriber's options.  The options looked at are
    'city1', 'city2', 'when' and 'maxcost'; any that are absent do not
    restrict the match.  city1/city2/cost describe the deal, frtime/totime
    bound the travel period.

    Raises ValueError if 'when' is not in DATEFMT or 'maxcost' is not an
    integer.
    """
    citymatch = True
    if 'city1' in crit and 'city2' in crit:
        # Both ends given: accept the route in either direction
        if ((crit['city1'] != city1 or crit['city2'] != city2) and
                (crit['city1'] != city2 or crit['city2'] != city1)):
            citymatch = False
    elif 'city1' in crit:
        # One city given: it may be either end of the route
        if crit['city1'] != city1 and crit['city1'] != city2:
            citymatch = False

    datematch = True
    if 'when' in crit:
        travtime = _parsedate(crit['when'])
        if travtime < frtime or travtime > totime:
            datematch = False

    costmatch = True
    if 'maxcost' in crit:
        if cost > int(crit['maxcost']):
            costmatch = False

    return citymatch and datematch and costmatch


# Built-in defaults; scrape-vb.ini (if present) overrides them.  Every
# section other than 'global' is named after an email address and holds
# that person's match criteria.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals')
conf.set('global', 'vburl', 'http://virginblue.com.au')
conf.read('scrape-vb.ini')

try:
    # For offline testing, read a saved copy of the page instead:
    #   f = open("vb-happyhour.html")
    f = urllib.urlopen(conf.get('global', 'vburl'))
except IOError:
    # sys.exc_info() instead of binding the exception in the except clause,
    # so no version-specific "except X, e" / "except X as e" syntax is needed
    print("Unable to fetch page - " + str(sys.exc_info()[1]))
    sys.exit(1)

try:
    s = BeautifulSoup.BeautifulSoup(f)
finally:
    # The page has been read by now; don't leave the connection open
    f.close()

hrr = s.find("ul", "happyhr-rows")
if hrr is None:
    print("No happy hour details found")
    sys.exit(0)

hrlist = hrr.findAll("li")

# Locate the travel period text, tolerating a page that has fewer lists than
# expected or an <li> that does not contain a single plain string.
tpertext = None
uls = s.findAll('ul')
if len(uls) > TPER_UL_INDEX:
    tperitem = uls[TPER_UL_INDEX].find('li')
    if tperitem is not None:
        tpertext = tperitem.string

times = None
if tpertext is not None:
    times = parsetper.match(tpertext)
if times is None:
    # %r copes with both a missing value (None) and non-ASCII text
    print("Unable to parse travel period %r" % (tpertext,))
    sys.exit(0)

frtime = _parsedate(times.group(1))
totime = _parsedate(times.group(2))

# Maps email address -> list of [city1, city2, cost, url] for matched deals
output = {}
for item in hrlist:
    href = item.find('a')
    title = None
    if href is not None:
        title = href.get('title')
    match = None
    if title is not None:
        match = parsetitle.match(title)
    if match is None:
        # Report the offending title, not the whole page
        print("Unable to match %r" % (title,))
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']

    for email in conf.sections():
        if email == 'global':
            continue

        # The section name is the address; options in the section are the
        # criteria (an 'email' option, if present, overrides the name).
        t = {'email': email}
        t.update(conf.items(email))

        if _criteria_match(t, city1, city2, cost, frtime, totime):
            output.setdefault(t['email'], []).append([city1, city2, cost, url])

# Mail is only sent when every mail option is present and mailsend is true
try:
    mailsubj = conf.get('global', 'mailsubj')
    mailhost = conf.get('global', 'mailhost')
    mailsend = conf.getboolean('global', 'mailsend')
    mailfrom = conf.get('global', 'mailfrom')
except ConfigParser.NoOptionError:
    mailsend = False

# Same for every recipient, so build it once
ttimestr = "Note: travel period is from %s to %s" % \
           (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))

server = None
if mailsend:
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)
else:
    print("Note: Mail sending disabled")

try:
    for o in output:
        if mailsend:
            msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" %
                   (mailfrom, o, mailsubj))
            msg = msg + "Your criteria for flights have been matched\r\n\r\n"
            for deal in output[o]:
                msg = msg + "%s <-> %s costs $%d - %s\r\n" % \
                      (deal[0], deal[1], deal[2], deal[3])
            msg = msg + "\r\n" + ttimestr + "\r\n"
            server.sendmail(mailfrom, o, msg)
        else:
            print("Match for " + o)
            for deal in output[o]:
                print("%s <-> %s costs $%d" % (deal[0], deal[1], deal[2]))
            print(ttimestr)
            # NOTE(review): blank separator line assumed to belong to the
            # console branch only - confirm against the original layout
            print("")
finally:
    # Always end the SMTP session, even if a send fails part-way through
    if server is not None:
        server.quit()