Mercurial > ~darius > hgwebdir.cgi > scrape-vb
annotate scrape-vb.py @ 9:3e03facad74b default tip
New example files for latest layout.
author | darius |
---|---|
date | Thu, 18 Oct 2007 06:58:00 +0000 |
parents | d17fd6f3a492 |
children |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 | |
3 ############################################################################ | |
4 # Screen scraper for Virgin Blue to look for happy hour deals | |
5 # | |
6 # Prints out (and emails) when criteria match based on cost, | |
7 # destination, etc | |
8 # | |
8 | 9 # $Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $ |
1 | 10 ############################################################################ |
11 # | |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
13 # | |
14 # Redistribution and use in source and binary forms, with or without | |
15 # modification, are permitted provided that the following conditions | |
16 # are met: | |
17 # 1. Redistributions of source code must retain the above copyright | |
18 # notice, this list of conditions and the following disclaimer. | |
19 # 2. Redistributions in binary form must reproduce the above copyright | |
20 # notice, this list of conditions and the following disclaimer in the | |
21 # documentation and/or other materials provided with the distribution. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 # SUCH DAMAGE. | |
34 # | |
35 ############################################################################ | |
36 | |
6 | 37 import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib |
7 | 38 import ConfigParser, optparse, SMSVodaAu |
6 | 39 |
# Help text for --help; optparse substitutes the program name for %prog.
usage = '''%prog [options]
Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini'''

# Keep the parser under its own name so the imported optparse module is not
# rebound to an OptionParser instance.
parser = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $")
parser.add_option('-d', '--debug', action="store_true", default=False,
                  help="Disable mail & SMS sending, prints message to stdout")
parser.add_option('-f', '--file', help="Do not fetch the page, use this file instead")
parser.add_option('-e', '--example', action="store_true", default=False,
                  help="Print an example configuration file to stdout and exit")
# options carries the debug, file and example settings defined above.
(options, args) = parser.parse_args()
1 | 50 |
6 | 51 if (options.example): |
52 print '''[global] | |
53 mailsubj="Subject line for emails" | |
54 # The following 3 options are necessary before email will be sent | |
55 mailfrom=user@host.com | |
56 mailsend=True | |
57 mailhost=mail.server.com | |
7 | 58 smsuser=0412312312 |
59 smspass=mys3krit | |
60 smssend=True | |
6 | 61 |
62 [user@host.com] | |
63 # All fields are optional | |
64 city1=Foo | |
65 city2=Bar | |
66 when=dd/mm/yy | |
67 maxcost=123 | |
7 | 68 phone=0498765432 |
6 | 69 ''' |
70 sys.exit(0) | |
71 | |
# Matches "Travel Period: <date> - <date>" (any case) and captures the two
# slash-separated dates as groups 1 and 2.
parsetper = re.compile('Travel Period: '
                       '([0-9]+/[0-9]+/[0-9]+)'
                       ' - '
                       '([0-9]+/[0-9]+/[0-9]+)',
                       re.IGNORECASE)
73 | |
# Start from built-in defaults in the [global] section; values read from the
# ini files below override them.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for defopt, defval in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                       ('vburl', 'http://virginblue.com.au')):
    conf.set('global', defopt, defval)

# Read ./scrape-vb.ini and, when $HOME is set, the dotfile in the home
# directory as well.
conflist = ['scrape-vb.ini']
if 'HOME' in os.environ:
    conflist.append(os.path.expanduser('~/.scrape-vb.ini'))
conf.read(conflist)
1 | 84 try: |
6 | 85 if (options.file != None): |
86 f = open(options.file) | |
87 else: | |
88 f = urllib.urlopen(conf.get('global', 'vburl')) | |
1 | 89 except IOError, e: |
90 print "Unable to fetch page - " + str(e) | |
91 sys.exit(1) | |
7 | 92 |
93 # Test if we have been configured to send SMSs | |
94 try: | |
95 smsuser = conf.get('global', 'smsuser') | |
96 smspass = conf.get('global', 'smspass') | |
97 smssend = conf.getboolean('global', 'smssend') | |
98 except ConfigParser.NoOptionError: | |
99 smssend = False | |
100 | |
101 if (options.debug == True and smssend): | |
102 print "smssend overridden due to debugging" | |
103 smssend = False | |
104 | |
105 if (smssend): | |
106 smshndl = SMSVodaAu.SMSVodaAu(smsuser, smspass) | |
107 | |
1 | 108 s = BeautifulSoup.BeautifulSoup(f) |
8 | 109 citypairs = s.findAll("td", "city-pair") |
110 if (citypairs == []): | |
1 | 111 print "No happy hour details found" |
112 sys.exit(0) | |
8 | 113 |
114 prices = s.findAll("td", "dash-r price") | |
115 if (prices == []): | |
116 print "Couldn't find prices" | |
117 sys.exit(0) | |
1 | 118 |
8 | 119 if (len(citypairs) != len(prices)): |
120 print "City pair & price tables don't have equal size" | |
121 sys.exit(0) | |
122 | |
123 times = parsetper.search(s.find('p', 'tandc').string) | |
1 | 124 if (times == None): |
125 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li')) | |
126 sys.exit(0) | |
127 | |
128 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) | |
129 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3]) | |
130 | |
#
# Go through the HTML and work out who wants to be notified of what
#
# Store in output, a dictionary keyed by email address which holds a
# list of each matching flight (city1, city2, cost, url)
#

# Read each recipient's criteria once up front; none of it depends on the
# flight being examined.  Every section other than [global] is a recipient.
wanted = []
for email in conf.sections():
    if (email == 'global'):
        continue
    # Stuff configuration into a dictionary for our convenience
    t = {'email' : email}
    t.update(conf.items(email))

    if ('when' in t):
        # A travel date outside the advertised period can never match, so
        # such a recipient is dropped before looking at any flight.
        travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
        if (travtime < frtime or travtime > totime):
            continue

    if ('maxcost' in t):
        t['maxcost'] = int(t['maxcost'])

    wanted.append(t)

output = {}
for pair, price in zip(citypairs, prices):
    href = pair.find('a')

    # NOTE(review): the chain of .next hops from the link to the two city
    # names depends on the exact page markup - confirm against a sample page.
    city1 = href.next.strip()
    city2 = href.next.next.next.next.next.strip()
    cost = int(price.find('a').string.strip('$^ '))
    url = href['href']
    ends = (city1, city2)

    for t in wanted:
        if ('city1' in t and 'city2' in t):
            # Both ends given: the route must match, in either direction
            if ((t['city1'] != city1 or t['city2'] != city2) and
                (t['city1'] != city2 or t['city2'] != city1)):
                continue
        elif ('city1' in t):
            # One end given: it may be either end of the flight
            if (t['city1'] not in ends):
                continue
        elif ('city2' in t):
            # Same rule when only city2 is configured, rather than silently
            # ignoring it and matching every flight
            if (t['city2'] not in ends):
                continue

        if ('maxcost' in t and cost > t['maxcost']):
            continue

        output.setdefault(t['email'], []).append([city1, city2, cost, url])
179 | |
7 | 180 # Test if we have been configured to send email |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
181 try: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
182 mailsubj = conf.get('global', 'mailsubj') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
183 mailhost = conf.get('global', 'mailhost') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
184 mailsend = conf.getboolean('global', 'mailsend') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
185 mailfrom = conf.get('global', 'mailfrom') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
186 except ConfigParser.NoOptionError: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
187 mailsend = False |
6 | 188 |
189 if (options.debug == True and mailsend): | |
190 print "mailsend overridden due to debugging" | |
191 mailsend = False | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
192 |
1 | 193 if (mailsend): |
194 server = smtplib.SMTP(mailhost) | |
195 #server.set_debuglevel(1) | |
7 | 196 |
197 # | |
198 # Output the various notifications | |
199 # | |
200 ttimestr = "Note: travel period is from %s to %s" % \ | |
201 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")) | |
202 | |
203 # Email each person about their flights | |
204 if (mailsend): | |
205 for o in output: | |
206 msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj) | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
207 msg = msg + "Your criteria for flights have been matched\r\n\r\n" |
7 | 208 for i in output[o]: |
209 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]) | |
210 | |
211 msg = msg + "\r\n" + ttimestr + "\r\n" | |
212 server.sendmail(mailfrom, o, msg) | |
213 | |
214 else: | |
215 # If not emailing print to stdout | |
216 for o in output: | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
217 print "Match for " + o |
7 | 218 for i in output[o]: |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
219 print "%s <-> %s costs $%d" % (i[0], i[1], i[2]) |
1 | 220 |
7 | 221 # SMS each person about their flights |
222 if (smssend): | |
223 for o in output: | |
224 if (conf.has_option(o, 'phone')): | |
8 | 225 pnum = conf.get(o, 'phone') |
7 | 226 msg = "" |
227 for i in output[o]: | |
228 msg = msg + "%s <-> %s $%d, " % (i[0], i[1], i[2]) | |
229 # Chop off the last , & make sure the whole message is not | |
230 # too large. | |
231 msgend = min(len(msg) - 2, 160) | |
232 print msg[0:msgend] | |
8 | 233 try: |
234 smshndl.sendamsg(pnum, msg[0:msgend]) | |
235 print "Sent SMS to " + pnum | |
236 except: | |
237 print "Unable to send SMS to " + pnum |