Mercurial > ~darius > hgwebdir.cgi > scrape-vb
comparison scrape-vb.py @ 1:8045db05180b SCRAPEVB_1_0
Initial revision
author | darius |
---|---|
date | Sat, 25 Aug 2007 05:17:29 +0000 |
parents | |
children | 89232ea0c3d4 |
comparison
equal
deleted
inserted
replaced
0:accc4c4654d7 | 1:8045db05180b |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ############################################################################ | |
4 # Screen scraper for Virgin Blue to look for happy hour deals | |
5 # | |
6 # Prints out (and emails) when criteria match based on cost, | |
7 # destination, etc | |
8 # | |
9 # $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $ | |
10 ############################################################################ | |
11 # | |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
13 # | |
14 # Redistribution and use in source and binary forms, with or without | |
15 # modification, are permitted provided that the following conditions | |
16 # are met: | |
17 # 1. Redistributions of source code must retain the above copyright | |
18 # notice, this list of conditions and the following disclaimer. | |
19 # 2. Redistributions in binary form must reproduce the above copyright | |
20 # notice, this list of conditions and the following disclaimer in the | |
21 # documentation and/or other materials provided with the distribution. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 # SUCH DAMAGE. | |
34 # | |
35 ############################################################################ | |
36 | |
37 import re, BeautifulSoup, datetime, time, smtplib, sys, urllib | |
38 | |
#### Configuration

### Travel criteria
# Each traveller is a dict. Supported keys are email, when, city1,
# city2 and maxcost.
#   email   - mandatory; address the matching deals are reported to.
#   city1   - if given without city2, a deal matches when either end of
#             the route is this city.
#   city2   - if given together with city1, the deal must be between
#             these two cities (in either direction).
#   when    - optional travel date as dd/mm/yy; omitted means any date.
#   maxcost - optional maximum fare; omitted means any cost.
travellers = [
    { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' },
    { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' },
]

### Mail host
mailhost = 'mail.dons.net.au'

### Who the email is from
mailfrom = 'darius@dons.net.au'

### What's on the subject line
mailsubj = 'Virgin Blue Happy Hour Deals'

### Actually send email?
mailsend = False

### URL to parse
vburl = 'http://virginblue.com.au'

# Deal title, e.g. "Sydney - Melbourne $59": groups are city, city, cost.
# Raw strings keep the "\$" escape for the regex engine instead of relying
# on Python passing an unknown string escape through unchanged.
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)

# Travel period line: groups are the start and end dates.
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)
68 | |
# Fetch the page; a fetch failure is the only condition that exits non-zero.
try:
    # Swap in the line below to parse a saved copy instead of the live site.
    #f = open("vb-happyhour.html")
    f = urllib.urlopen(vburl)
except IOError as e:
    print("Unable to fetch page - " + str(e))
    sys.exit(1)

try:
    s = BeautifulSoup.BeautifulSoup(f)
finally:
    # The page handle is not needed once the document has been parsed.
    f.close()

hrr = s.find("ul", "happyhr-rows")
if hrr is None:
    print("No happy hour details found")
    sys.exit(0)

hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
# The travel period is read from the first <li> of the 12th <ul> on the
# page. Each step is checked so that a layout change produces the error
# message below rather than an IndexError/AttributeError/TypeError.
tpertext = None
ullist = s.findAll('ul')
if len(ullist) > 11:
    tperitem = ullist[11].find('li')
    if tperitem is not None:
        tpertext = tperitem.string

times = None
if tpertext is not None:
    times = parsetper.match(tpertext)
if times is None:
    # Show the text that failed to parse (or None if it was not found).
    print("Unable to parse travel period " + str(tpertext))
    sys.exit(0)

# Convert the dd/mm/yy strings into datetimes (midnight) for comparison.
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])
93 | |
94 #print "Travel from %s to %s" % (str(frtime), str(totime)) | |
95 | |
# Matched deals keyed by traveller email; each value is a list of
# [city1, city2, cost, url] entries.
output = {}
for i in hrlist:
    href = i.find('a')

    # Guard each step so one malformed entry is reported and skipped.
    title = None
    if href is not None:
        title = href.get('title')
    match = None
    if title is not None:
        match = parsetitle.match(title)
    if match is None:
        # Report the offending entry only, not the whole parsed page.
        if title is not None:
            print("Unable to match %s" % (title,))
        else:
            print("Unable to match %s" % (i,))
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']

    for t in travellers:
        if 'email' not in t:
            print("No email key found, configuration error?")
            continue

        # Route filter: with both cities the deal must join them (either
        # direction); with only city1 either end of the deal may match.
        citymatch = True
        if 'city1' in t and 'city2' in t:
            citymatch = ((t['city1'] == city1 and t['city2'] == city2) or
                         (t['city1'] == city2 and t['city2'] == city1))
        elif 'city1' in t:
            citymatch = t['city1'] in (city1, city2)

        # Date filter: the requested date must fall within the travel period.
        datematch = True
        if 'when' in t:
            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
            datematch = frtime <= travtime <= totime

        # Cost filter: the fare must not exceed the traveller's maximum.
        costmatch = True
        if 'maxcost' in t:
            costmatch = cost <= int(t['maxcost'])

        if citymatch and datematch and costmatch:
            output.setdefault(t['email'], []).append([city1, city2, cost, url])
139 | |
# Report the matches per traveller: emailed when mailsend is set,
# otherwise the full message is printed to stdout.
server = None
if mailsend:
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)

try:
    for o in output:
        parts = [
            "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj),
            "Your criteria for flights have been matched\r\n\r\n",
        ]
        print("Sending email to " + o)
        for i in output[o]:
            print("%s <-> %s costs $%d" % (i[0], i[1], i[2]))
            parts.append("%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]))

        # NOTE(review): "%e" (space-padded day) is not supported by every
        # platform's strftime - confirm on the deployment host.
        parts.append("\r\nNote: travel period is from %s to %s" %
                     (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")))
        msg = "".join(parts)

        if server is not None:
            server.sendmail(mailfrom, o, msg)
        else:
            print(msg)
            print("")
finally:
    # Close the SMTP session even if sending to one recipient fails.
    if server is not None:
        server.quit()