Mercurial > ~darius > hgwebdir.cgi > scrape-vb
annotate scrape-vb.py @ 7:bf896507faa9
Add code to send an SMS if configured to do so.
Rearrange the output stage a bit to make it clearer.
author | darius |
---|---|
date | Fri, 07 Sep 2007 01:31:47 +0000 |
parents | 9f3eb9a07966 |
children | d17fd6f3a492 |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 | |
3 ############################################################################ | |
4 # Screen scraper for Virgin Blue to look for happy hour deals | |
5 # | |
6 # Prints out, emails and/or SMSes deals that match criteria based on cost, | |
7 # destination, etc | |
8 # | |
7 | 9 # $Id: scrape-vb.py,v 1.6 2007/09/07 01:31:47 darius Exp $ |
1 | 10 ############################################################################ |
11 # | |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
13 # | |
14 # Redistribution and use in source and binary forms, with or without | |
15 # modification, are permitted provided that the following conditions | |
16 # are met: | |
17 # 1. Redistributions of source code must retain the above copyright | |
18 # notice, this list of conditions and the following disclaimer. | |
19 # 2. Redistributions in binary form must reproduce the above copyright | |
20 # notice, this list of conditions and the following disclaimer in the | |
21 # documentation and/or other materials provided with the distribution. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 # SUCH DAMAGE. | |
34 # | |
35 ############################################################################ | |
36 | |
6 | 37 import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib |
7 | 38 import ConfigParser, optparse, SMSVodaAu |
6 | 39 |
40 usage = '''%prog [options] | |
41 Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini''' | |
42 | |
7 | 43 optparse = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.6 2007/09/07 01:31:47 darius Exp $") |
6 | 44 optparse.add_option('-d', '--debug', action="store_true", default=False, |
7 | 45 help="Disable mail & SMS sending, prints message to stdout") |
6 | 46 optparse.add_option('-f', '--file', help="Do not fetch the page, use this file instead") |
47 optparse.add_option('-e', '--example', action="store_true", default=False, | |
48 help="Print an example configuration file to stdout and exit") | |
49 (options, args) = optparse.parse_args() | |
1 | 50 |
6 | 51 if (options.example): |
52 print '''[global] | |
53 mailsubj="Subject line for emails" | |
54 # The following 3 options are necessary before email will be sent | |
55 mailfrom=user@host.com | |
56 mailsend=True | |
57 mailhost=mail.server.com | |
7 | 58 smsuser=0412312312 |
59 smspass=mys3krit | |
60 smssend=True | |
6 | 61 |
62 [user@host.com] | |
63 # All fields are optional | |
64 city1=Foo | |
65 city2=Bar | |
66 when=dd/mm/yy | |
67 maxcost=123 | |
7 | 68 phone=0498765432 |
6 | 69 ''' |
70 sys.exit(0) | |
71 | |
1 | 72 parsetitle = re.compile('([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE) |
73 parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) | |
74 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
75 conf = ConfigParser.ConfigParser() |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
76 conf.add_section('global') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
77 conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
78 conf.set('global', 'vburl', 'http://virginblue.com.au') |
5
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
79 |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
80 conflist = ['scrape-vb.ini'] |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
81 if ('HOME' in os.environ): |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
82 conflist.append(os.path.expanduser('~/.scrape-vb.ini')) |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
83 conf.read(conflist) |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
84 |
1 | 85 try: |
6 | 86 if (options.file != None): |
87 f = open(options.file) | |
88 else: | |
89 f = urllib.urlopen(conf.get('global', 'vburl')) | |
1 | 90 except IOError, e: |
91 print "Unable to fetch page - " + str(e) | |
92 sys.exit(1) | |
7 | 93 |
94 # Test if we have been configured to send SMSs | |
95 try: | |
96 smsuser = conf.get('global', 'smsuser') | |
97 smspass = conf.get('global', 'smspass') | |
98 smssend = conf.getboolean('global', 'smssend') | |
99 except ConfigParser.NoOptionError: | |
100 smssend = False | |
101 | |
102 if (options.debug == True and smssend): | |
103 print "smssend overridden due to debugging" | |
104 smssend = False | |
105 | |
106 if (smssend): | |
107 smshndl = SMSVodaAu.SMSVodaAu(smsuser, smspass) | |
108 | |
1 | 109 s = BeautifulSoup.BeautifulSoup(f) |
110 hrr = s.find("ul", "happyhr-rows") | |
111 if (hrr == None): | |
112 print "No happy hour details found" | |
113 sys.exit(0) | |
114 | |
115 hrlist = hrr.findAll("li") | |
116 | |
117 # XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it | |
118 # doesn't work | |
119 times = parsetper.match(s.findAll('ul')[11].find('li').string) | |
120 if (times == None): | |
121 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li')) | |
122 sys.exit(0) | |
123 | |
124 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) | |
125 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3]) | |
126 | |
7 | 127 # |
128 # Go through the HTML and work out who wants to be notified of what | |
129 # | |
130 # Store in output, a dictionary keyed by email address which holds a | |
131 # list of each matching flight (city1, city2, cost, url) | |
132 # | |
1 | 133 output = {} |
134 for i in hrlist: | |
135 href = i.find('a') | |
136 match = parsetitle.match(href['title']) | |
137 if (match == None): | |
138 print "Unable to match " + str(s) | |
139 continue | |
140 | |
141 city1 = match.group(1) | |
142 city2 = match.group(2) | |
143 cost = int(match.group(3)) | |
144 url = href['href'] | |
145 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
146 for email in conf.sections(): |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
147 if (email == 'global'): |
1 | 148 continue |
6 | 149 # Stuff configuration into a dictionary for our convenience |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
150 t = {'email' : email} |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
151 for i in conf.items(email): |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
152 t[i[0]] = i[1] |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
153 |
1 | 154 citymatch = True |
155 if ('city1' in t and 'city2' in t): | |
156 if((t['city1'] != city1 or t['city2'] != city2) and | |
157 (t['city1'] != city2 or t['city2'] != city1)): | |
158 citymatch = False | |
159 elif ('city1' in t): | |
160 if (t['city1'] != city1 and t['city1'] != city2): | |
161 citymatch = False | |
162 | |
163 datematch = True | |
164 if ('when' in t): | |
165 travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3]) | |
166 if (travtime < frtime or travtime > totime): | |
167 datematch = False | |
168 | |
169 costmatch = True | |
170 if ('maxcost' in t): | |
171 if (cost > int(t['maxcost'])): | |
172 costmatch = False | |
173 | |
174 if (citymatch and datematch and costmatch): | |
175 if (t['email'] not in output): | |
176 | |
177 output[t['email']] = [] | |
178 output[t['email']].append([city1, city2, cost, url]) | |
179 | |
7 | 180 # Test if we have been configured to send email |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
181 try: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
182 mailsubj = conf.get('global', 'mailsubj') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
183 mailhost = conf.get('global', 'mailhost') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
184 mailsend = conf.getboolean('global', 'mailsend') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
185 mailfrom = conf.get('global', 'mailfrom') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
186 except ConfigParser.NoOptionError: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
187 mailsend = False |
6 | 188 |
189 if (options.debug == True and mailsend): | |
190 print "mailsend overridden due to debugging" | |
191 mailsend = False | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
192 |
1 | 193 if (mailsend): |
194 server = smtplib.SMTP(mailhost) | |
195 #server.set_debuglevel(1) | |
7 | 196 |
197 # | |
198 # Output the various notifications | |
199 # | |
200 ttimestr = "Note: travel period is from %s to %s" % \ | |
201 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")) | |
202 | |
203 # Email each person about their flights | |
204 if (mailsend): | |
205 for o in output: | |
206 msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj) | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
207 msg = msg + "Your criteria for flights have been matched\r\n\r\n" |
7 | 208 for i in output[o]: |
209 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]) | |
210 | |
211 msg = msg + "\r\n" + ttimestr + "\r\n" | |
212 server.sendmail(mailfrom, o, msg) | |
213 | |
214 else: | |
215 # If not emailing print to stdout | |
216 for o in output: | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
217 print "Match for " + o |
7 | 218 for i in output[o]: |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
219 print "%s <-> %s costs $%d" % (i[0], i[1], i[2]) |
1 | 220 |
7 | 221 # SMS each person about their flights |
222 if (smssend): | |
223 for o in output: | |
224 if (conf.has_option(o, 'phone')): | |
225 msg = "" | |
226 for i in output[o]: | |
227 msg = msg + "%s <-> %s $%d, " % (i[0], i[1], i[2]) | |
228 # Chop off the last , & make sure the whole message is not | |
229 # too large. | |
230 msgend = min(len(msg) - 2, 160) | |
231 print "SMS to " + conf.get(o, 'phone') | |
232 print msg[0:msgend] | |
233 smshndl.sendamsg(conf.get(o, 'phone'), msg[0:msgend]) |