1
|
1 #!/usr/bin/env python
|
|
2
|
|
3 ############################################################################
|
|
4 # Screen scraper for Virgin Blue to look for happy hour deals
|
|
5 #
|
|
6 # Prints out (and emails) when criteria match based on cost,
|
|
7 # destination, etc
|
|
8 #
|
|
9 # $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $
|
|
10 ############################################################################
|
|
11 #
|
|
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved.
|
|
13 #
|
|
14 # Redistribution and use in source and binary forms, with or without
|
|
15 # modification, are permitted provided that the following conditions
|
|
16 # are met:
|
|
17 # 1. Redistributions of source code must retain the above copyright
|
|
18 # notice, this list of conditions and the following disclaimer.
|
|
19 # 2. Redistributions in binary form must reproduce the above copyright
|
|
20 # notice, this list of conditions and the following disclaimer in the
|
|
21 # documentation and/or other materials provided with the distribution.
|
|
22 #
|
|
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
33 # SUCH DAMAGE.
|
|
34 #
|
|
35 ############################################################################
|
|
36
|
|
37 import re, BeautifulSoup, datetime, time, smtplib, sys, urllib
|
|
38
|
|
#### Configuration

### Travel criteria
# Each traveller is a dictionary.  Supported keys are email, when,
# city1, city2 and maxcost.
#  - email is mandatory (entries without it are reported and skipped)
#  - city1 and city2 together match that route in either direction
#  - city1 on its own matches any deal to or from that city
#  - when (dd/mm/yy) must fall inside the advertised travel period
#  - maxcost is the highest acceptable fare
# when and maxcost are optional (will match for any date or cost)
travellers = [
    { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' },
    { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' },
]

### Mail host
mailhost = 'mail.dons.net.au'

### Who the email is from
mailfrom = 'darius@dons.net.au'

### What's on the subject line
mailsubj = 'Virgin Blue Happy Hour Deals'

### Actually send email?
mailsend = False

### URL to parse
vburl = 'http://virginblue.com.au'

### Patterns used to pick the page apart
# Raw strings are used so that the backslash reaches the regex engine
# untouched instead of being treated as a string escape.

# Deal title of the form "<city> - <city> $<cost>"
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)

# Travel period line; the groups are the first and last travel dates
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)
|
|
68
|
|
# Download the page to scrape.  A failed fetch is reported and ends the
# run with exit status 1.
try:
    # For testing against a saved copy use this instead:
    #f = open("vb-happyhour.html")
    f = urllib.urlopen(vburl)
except IOError:
    fetcherr = sys.exc_info()[1]
    print("Unable to fetch page - " + str(fetcherr))
    sys.exit(1)
|
|
75
|
|
# Parse the page and locate the <ul class="happyhr-rows"> list; each of
# its <li> items is one deal.  If the list is absent, say so and stop
# with exit status 0.
s = BeautifulSoup.BeautifulSoup(f)
hrr = s.find("ul", "happyhr-rows")
if hrr is not None:
    hrlist = hrr.findAll("li")
else:
    print("No happy hour details found")
    sys.exit(0)
|
|
83
|
|
# Work out the period the deals are valid for (frtime .. totime).
#
# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work, so the list is picked out by its position in the page.
try:
    tperitem = s.findAll('ul')[11].find('li')
except IndexError:
    # Fewer lists on the page than expected
    tperitem = None

# .string is None when the item holds anything other than a single
# piece of text, so check before handing it to the regex.
tpertext = None
if tperitem is not None:
    tpertext = tperitem.string

times = None
if tpertext is not None:
    times = parsetper.match(tpertext)

if times is None:
    # Show the markup that could not be parsed
    print("Unable to parse travel period " + str(tperitem))
    sys.exit(0)

# Dates on the page are dd/mm/yy; keep only year, month and day
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])

#print "Travel from %s to %s" % (str(frtime), str(totime))
|
|
95
|
|
# Match every deal against every traveller's criteria.
# output maps email address -> list of [city1, city2, cost, url]
output = {}
for i in hrlist:
    # The deal details live in the title of the item's link
    href = i.find('a')
    title = None
    if href is not None:
        title = href.get('title')
    if title is None:
        print("Unable to find a deal title in " + str(i))
        continue

    match = parsetitle.match(title)
    if match is None:
        # Report just the title that failed, not the whole page
        print("Unable to match %r" % (title,))
        continue

    # The city groups also accept spaces, so trim any stray padding
    city1 = match.group(1).strip()
    city2 = match.group(2).strip()
    cost = int(match.group(3))
    url = href.get('href', '')

    for t in travellers:
        if 'email' not in t:
            print("No email key found, configuration error?")
            continue

        # Both cities given: the route must match in either direction.
        # Only city1 given: it may be either end of the route.
        # Otherwise there is no restriction on the route.
        if 'city1' in t and 'city2' in t:
            citymatch = (t['city1'], t['city2']) in ((city1, city2), (city2, city1))
        elif 'city1' in t:
            citymatch = t['city1'] in (city1, city2)
        else:
            citymatch = True

        # The requested date must lie inside the travel period
        datematch = True
        if 'when' in t:
            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
            datematch = frtime <= travtime <= totime

        costmatch = 'maxcost' not in t or cost <= int(t['maxcost'])

        if citymatch and datematch and costmatch:
            output.setdefault(t['email'], []).append([city1, city2, cost, url])
|
|
139
|
|
# Tell each traveller about their matching deals.  A summary always goes
# to stdout; the full message is emailed when mailsend is set and
# printed otherwise.
server = None
if mailsend and output:
    # Only open a connection when there is something to send
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)

try:
    for o in output:
        parts = [
            "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj),
            "Your criteria for flights have been matched\r\n\r\n",
        ]
        print("Sending email to " + o)
        for i in output[o]:
            print("%s <-> %s costs $%d" % (i[0], i[1], i[2]))
            parts.append("%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]))

        # NOTE: %e (day of month without a leading zero) is not one of
        # the strftime directives Python documents as portable.
        parts.append("\r\nNote: travel period is from %s to %s" %
                     (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")))
        msg = "".join(parts)

        if server is not None:
            server.sendmail(mailfrom, o, msg)
        else:
            print(msg)
            print("")
finally:
    # Close the SMTP session even if a send fails part way through
    if server is not None:
        server.quit()
|