Mercurial > ~darius > hgwebdir.cgi > scrape-vb
comparison scrape-vb.py @ 8:d17fd6f3a492
- Catch up with new VB page layout.
- Don't explode if we can't send an SMS, just log it.
author | darius |
---|---|
date | Thu, 18 Oct 2007 06:57:35 +0000 |
parents | bf896507faa9 |
children |
comparison
equal
deleted
inserted
replaced
7:bf896507faa9 | 8:d17fd6f3a492 |
---|---|
4 # Screen scraper for Virgin Blue to look for happy hour deals | 4 # Screen scraper for Virgin Blue to look for happy hour deals |
5 # | 5 # |
6 # Prints out (and emails) when criteria match based on cost, | 6 # Prints out (and emails) when criteria match based on cost, |
7 # destination, etc | 7 # destination, etc |
8 # | 8 # |
9 # $Id: scrape-vb.py,v 1.6 2007/09/07 01:31:47 darius Exp $ | 9 # $Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $ |
10 ############################################################################ | 10 ############################################################################ |
11 # | 11 # |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | 12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. |
13 # | 13 # |
14 # Redistribution and use in source and binary forms, with or without | 14 # Redistribution and use in source and binary forms, with or without |
38 import ConfigParser, optparse, SMSVodaAu | 38 import ConfigParser, optparse, SMSVodaAu |
39 | 39 |
40 usage = '''%prog [options] | 40 usage = '''%prog [options] |
41 Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini''' | 41 Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini''' |
42 | 42 |
43 optparse = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.6 2007/09/07 01:31:47 darius Exp $") | 43 optparse = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $") |
44 optparse.add_option('-d', '--debug', action="store_true", default=False, | 44 optparse.add_option('-d', '--debug', action="store_true", default=False, |
45 help="Disable mail & SMS sending, prints message to stdout") | 45 help="Disable mail & SMS sending, prints message to stdout") |
46 optparse.add_option('-f', '--file', help="Do not fetch the page, use this file instead") | 46 optparse.add_option('-f', '--file', help="Do not fetch the page, use this file instead") |
47 optparse.add_option('-e', '--example', action="store_true", default=False, | 47 optparse.add_option('-e', '--example', action="store_true", default=False, |
48 help="Print an example configuration file to stdout and exit") | 48 help="Print an example configuration file to stdout and exit") |
67 maxcost=123 | 67 maxcost=123 |
68 phone=0498765432 | 68 phone=0498765432 |
69 ''' | 69 ''' |
70 sys.exit(0) | 70 sys.exit(0) |
71 | 71 |
72 parsetitle = re.compile('([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE) | |
73 parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) | 72 parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) |
74 | 73 |
75 conf = ConfigParser.ConfigParser() | 74 conf = ConfigParser.ConfigParser() |
76 conf.add_section('global') | 75 conf.add_section('global') |
77 conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals') | 76 conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals') |
105 | 104 |
106 if (smssend): | 105 if (smssend): |
107 smshndl = SMSVodaAu.SMSVodaAu(smsuser, smspass) | 106 smshndl = SMSVodaAu.SMSVodaAu(smsuser, smspass) |
108 | 107 |
109 s = BeautifulSoup.BeautifulSoup(f) | 108 s = BeautifulSoup.BeautifulSoup(f) |
110 hrr = s.find("ul", "happyhr-rows") | 109 citypairs = s.findAll("td", "city-pair") |
111 if (hrr == None): | 110 if (citypairs == []): |
112 print "No happy hour details found" | 111 print "No happy hour details found" |
113 sys.exit(0) | 112 sys.exit(0) |
114 | 113 |
115 hrlist = hrr.findAll("li") | 114 prices = s.findAll("td", "dash-r price") |
116 | 115 if (prices == []): |
117 # XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it | 116 print "Couldn't find prices" |
118 # doesn't work | 117 sys.exit(0) |
119 times = parsetper.match(s.findAll('ul')[11].find('li').string) | 118 |
119 if (len(citypairs) != len(prices)): | |
120 print "City pair & price tables don't have equal size" | |
121 sys.exit(0) | |
122 | |
123 times = parsetper.search(s.find('p', 'tandc').string) | |
120 if (times == None): | 124 if (times == None): |
121 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li')) | 125 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li')) |
122 sys.exit(0) | 126 sys.exit(0) |
123 | 127 |
124 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) | 128 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) |
129 # | 133 # |
130 # Store in output, a dictionary keyed by email address which holds a | 134 # Store in output, a dictionary keyed by email address which holds a |
131 # list of each matching flight (city1, city2, cost, url) | 135 # list of each matching flight (city1, city2, cost, url) |
132 # | 136 # |
133 output = {} | 137 output = {} |
134 for i in hrlist: | 138 for i, p in zip(citypairs, prices): |
135 href = i.find('a') | 139 href = i.find('a') |
136 match = parsetitle.match(href['title']) | 140 |
137 if (match == None): | 141 city1 = href.next.strip() |
138 print "Unable to match " + str(s) | 142 city2 = href.next.next.next.next.next.strip() |
139 continue | 143 cost = int(p.find('a').string.strip('$^ ')) |
140 | |
141 city1 = match.group(1) | |
142 city2 = match.group(2) | |
143 cost = int(match.group(3)) | |
144 url = href['href'] | 144 url = href['href'] |
145 | 145 |
146 for email in conf.sections(): | 146 for email in conf.sections(): |
147 if (email == 'global'): | 147 if (email == 'global'): |
148 continue | 148 continue |
220 | 220 |
221 # SMS each person about their flights | 221 # SMS each person about their flights |
222 if (smssend): | 222 if (smssend): |
223 for o in output: | 223 for o in output: |
224 if (conf.has_option(o, 'phone')): | 224 if (conf.has_option(o, 'phone')): |
225 pnum = conf.get(o, 'phone') | |
225 msg = "" | 226 msg = "" |
226 for i in output[o]: | 227 for i in output[o]: |
227 msg = msg + "%s <-> %s $%d, " % (i[0], i[1], i[2]) | 228 msg = msg + "%s <-> %s $%d, " % (i[0], i[1], i[2]) |
228 # Chop off the last , & make sure the whole message is not | 229 # Chop off the last , & make sure the whole message is not |
229 # too large. | 230 # too large. |
230 msgend = min(len(msg) - 2, 160) | 231 msgend = min(len(msg) - 2, 160) |
231 print "SMS to " + conf.get(o, 'phone') | |
232 print msg[0:msgend] | 232 print msg[0:msgend] |
233 smshndl.sendamsg(conf.get(o, 'phone'), msg[0:msgend]) | 233 try: |
234 smshndl.sendamsg(pnum, msg[0:msgend]) | |
235 print "Sent SMS to " + pnum | |
236 except: | |
237 print "Unable to send SMS to " + pnum |