annotate scrape-vb.py @ 4:e3f4ef0b6e39

Oops, read URL from configuration like I planned.
author darius
date Mon, 27 Aug 2007 02:29:27 +0000
parents 89232ea0c3d4
children 275603a8e2ae
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
8045db05180b Initial revision
darius
parents:
diff changeset
1 #!/usr/bin/env python
8045db05180b Initial revision
darius
parents:
diff changeset
2
8045db05180b Initial revision
darius
parents:
diff changeset
3 ############################################################################
8045db05180b Initial revision
darius
parents:
diff changeset
4 # Screen scraper for Virgin Blue to look for happy hour deals
8045db05180b Initial revision
darius
parents:
diff changeset
5 #
8045db05180b Initial revision
darius
parents:
diff changeset
6 # Prints out (and emails) when criteria match based on cost,
8045db05180b Initial revision
darius
parents:
diff changeset
7 # destination, etc
8045db05180b Initial revision
darius
parents:
diff changeset
8 #
4
e3f4ef0b6e39 Oops, read URL from configuration like I planned.
darius
parents: 3
diff changeset
9 # $Id: scrape-vb.py,v 1.3 2007/08/27 02:29:27 darius Exp $
1
8045db05180b Initial revision
darius
parents:
diff changeset
10 ############################################################################
8045db05180b Initial revision
darius
parents:
diff changeset
11 #
8045db05180b Initial revision
darius
parents:
diff changeset
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved.
8045db05180b Initial revision
darius
parents:
diff changeset
13 #
8045db05180b Initial revision
darius
parents:
diff changeset
14 # Redistribution and use in source and binary forms, with or without
8045db05180b Initial revision
darius
parents:
diff changeset
15 # modification, are permitted provided that the following conditions
8045db05180b Initial revision
darius
parents:
diff changeset
16 # are met:
8045db05180b Initial revision
darius
parents:
diff changeset
17 # 1. Redistributions of source code must retain the above copyright
8045db05180b Initial revision
darius
parents:
diff changeset
18 # notice, this list of conditions and the following disclaimer.
8045db05180b Initial revision
darius
parents:
diff changeset
19 # 2. Redistributions in binary form must reproduce the above copyright
8045db05180b Initial revision
darius
parents:
diff changeset
20 # notice, this list of conditions and the following disclaimer in the
8045db05180b Initial revision
darius
parents:
diff changeset
21 # documentation and/or other materials provided with the distribution.
8045db05180b Initial revision
darius
parents:
diff changeset
22 #
8045db05180b Initial revision
darius
parents:
diff changeset
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
8045db05180b Initial revision
darius
parents:
diff changeset
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
8045db05180b Initial revision
darius
parents:
diff changeset
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
8045db05180b Initial revision
darius
parents:
diff changeset
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
8045db05180b Initial revision
darius
parents:
diff changeset
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
8045db05180b Initial revision
darius
parents:
diff changeset
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
8045db05180b Initial revision
darius
parents:
diff changeset
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
8045db05180b Initial revision
darius
parents:
diff changeset
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
8045db05180b Initial revision
darius
parents:
diff changeset
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
8045db05180b Initial revision
darius
parents:
diff changeset
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
8045db05180b Initial revision
darius
parents:
diff changeset
33 # SUCH DAMAGE.
8045db05180b Initial revision
darius
parents:
diff changeset
34 #
8045db05180b Initial revision
darius
parents:
diff changeset
35 ############################################################################
8045db05180b Initial revision
darius
parents:
diff changeset
36
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
37 import re, BeautifulSoup, datetime, time, smtplib, sys, urllib, ConfigParser
1
8045db05180b Initial revision
darius
parents:
diff changeset
38
8045db05180b Initial revision
darius
parents:
diff changeset
# Deal link titles look like "<city> - <city> $<whole dollar fare>".
# Raw strings keep the regex escape (\$) away from Python's own string
# escape processing.
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
# Travel window: two slash-separated dates after a "Travel Period:" label.
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)
8045db05180b Initial revision
darius
parents:
diff changeset
41
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
# Seed the 'global' section with built-in defaults, then load
# scrape-vb.ini on top of them.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for option, default in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                        ('vburl', 'http://virginblue.com.au')):
    conf.set('global', option, default)
conf.read('scrape-vb.ini')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
47
1
8045db05180b Initial revision
darius
parents:
diff changeset
48 try:
8045db05180b Initial revision
darius
parents:
diff changeset
49 #f = open("vb-happyhour.html")
4
e3f4ef0b6e39 Oops, read URL from configuration like I planned.
darius
parents: 3
diff changeset
50 f = urllib.urlopen(conf.get('global', 'vburl'))
1
8045db05180b Initial revision
darius
parents:
diff changeset
51 except IOError, e:
8045db05180b Initial revision
darius
parents:
diff changeset
52 print "Unable to fetch page - " + str(e)
8045db05180b Initial revision
darius
parents:
diff changeset
53 sys.exit(1)
8045db05180b Initial revision
darius
parents:
diff changeset
54
8045db05180b Initial revision
darius
parents:
diff changeset
55 s = BeautifulSoup.BeautifulSoup(f)
8045db05180b Initial revision
darius
parents:
diff changeset
56 hrr = s.find("ul", "happyhr-rows")
8045db05180b Initial revision
darius
parents:
diff changeset
57 if (hrr == None):
8045db05180b Initial revision
darius
parents:
diff changeset
58 print "No happy hour details found"
8045db05180b Initial revision
darius
parents:
diff changeset
59 sys.exit(0)
8045db05180b Initial revision
darius
parents:
diff changeset
60
8045db05180b Initial revision
darius
parents:
diff changeset
61 hrlist = hrr.findAll("li")
8045db05180b Initial revision
darius
parents:
diff changeset
62
8045db05180b Initial revision
darius
parents:
diff changeset
63 # XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
8045db05180b Initial revision
darius
parents:
diff changeset
64 # doesn't work
8045db05180b Initial revision
darius
parents:
diff changeset
65 times = parsetper.match(s.findAll('ul')[11].find('li').string)
8045db05180b Initial revision
darius
parents:
diff changeset
66 if (times == None):
8045db05180b Initial revision
darius
parents:
diff changeset
67 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li'))
8045db05180b Initial revision
darius
parents:
diff changeset
68 sys.exit(0)
8045db05180b Initial revision
darius
parents:
diff changeset
69
8045db05180b Initial revision
darius
parents:
diff changeset
70 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
8045db05180b Initial revision
darius
parents:
diff changeset
71 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])
8045db05180b Initial revision
darius
parents:
diff changeset
72
8045db05180b Initial revision
darius
parents:
diff changeset
73 output = {}
8045db05180b Initial revision
darius
parents:
diff changeset
74 for i in hrlist:
8045db05180b Initial revision
darius
parents:
diff changeset
75 href = i.find('a')
8045db05180b Initial revision
darius
parents:
diff changeset
76 match = parsetitle.match(href['title'])
8045db05180b Initial revision
darius
parents:
diff changeset
77 if (match == None):
8045db05180b Initial revision
darius
parents:
diff changeset
78 print "Unable to match " + str(s)
8045db05180b Initial revision
darius
parents:
diff changeset
79 continue
8045db05180b Initial revision
darius
parents:
diff changeset
80
8045db05180b Initial revision
darius
parents:
diff changeset
81 city1 = match.group(1)
8045db05180b Initial revision
darius
parents:
diff changeset
82 city2 = match.group(2)
8045db05180b Initial revision
darius
parents:
diff changeset
83 cost = int(match.group(3))
8045db05180b Initial revision
darius
parents:
diff changeset
84 url = href['href']
8045db05180b Initial revision
darius
parents:
diff changeset
85
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
86 for email in conf.sections():
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
87 if (email == 'global'):
1
8045db05180b Initial revision
darius
parents:
diff changeset
88 continue
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
89
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
90 t = {'email' : email}
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
91 for i in conf.items(email):
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
92 t[i[0]] = i[1]
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
93
1
8045db05180b Initial revision
darius
parents:
diff changeset
94 citymatch = True
8045db05180b Initial revision
darius
parents:
diff changeset
95 if ('city1' in t and 'city2' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
96 if((t['city1'] != city1 or t['city2'] != city2) and
8045db05180b Initial revision
darius
parents:
diff changeset
97 (t['city1'] != city2 or t['city2'] != city1)):
8045db05180b Initial revision
darius
parents:
diff changeset
98 citymatch = False
8045db05180b Initial revision
darius
parents:
diff changeset
99 elif ('city1' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
100 if (t['city1'] != city1 and t['city1'] != city2):
8045db05180b Initial revision
darius
parents:
diff changeset
101 citymatch = False
8045db05180b Initial revision
darius
parents:
diff changeset
102
8045db05180b Initial revision
darius
parents:
diff changeset
103 datematch = True
8045db05180b Initial revision
darius
parents:
diff changeset
104 if ('when' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
105 travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
8045db05180b Initial revision
darius
parents:
diff changeset
106 if (travtime < frtime or travtime > totime):
8045db05180b Initial revision
darius
parents:
diff changeset
107 datematch = False
8045db05180b Initial revision
darius
parents:
diff changeset
108
8045db05180b Initial revision
darius
parents:
diff changeset
109 costmatch = True
8045db05180b Initial revision
darius
parents:
diff changeset
110 if ('maxcost' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
111 if (cost > int(t['maxcost'])):
8045db05180b Initial revision
darius
parents:
diff changeset
112 costmatch = False
8045db05180b Initial revision
darius
parents:
diff changeset
113
8045db05180b Initial revision
darius
parents:
diff changeset
114 if (citymatch and datematch and costmatch):
8045db05180b Initial revision
darius
parents:
diff changeset
115 if (t['email'] not in output):
8045db05180b Initial revision
darius
parents:
diff changeset
116
8045db05180b Initial revision
darius
parents:
diff changeset
117 output[t['email']] = []
8045db05180b Initial revision
darius
parents:
diff changeset
118 output[t['email']].append([city1, city2, cost, url])
8045db05180b Initial revision
darius
parents:
diff changeset
119
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
120 try:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
121 mailsubj = conf.get('global', 'mailsubj')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
122 mailhost = conf.get('global', 'mailhost')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
123 mailsend = conf.getboolean('global', 'mailsend')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
124 mailfrom = conf.get('global', 'mailfrom')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
125 except ConfigParser.NoOptionError:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
126 mailsend = False
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
127
1
8045db05180b Initial revision
darius
parents:
diff changeset
128 if (mailsend):
8045db05180b Initial revision
darius
parents:
diff changeset
129 server = smtplib.SMTP(mailhost)
8045db05180b Initial revision
darius
parents:
diff changeset
130 #server.set_debuglevel(1)
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
131 else:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
132 print "Note: Mail sending disabled"
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
133
1
8045db05180b Initial revision
darius
parents:
diff changeset
134 for o in output:
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
135 if (mailsend):
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
136 msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj))
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
137 msg = msg + "Your criteria for flights have been matched\r\n\r\n"
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
138 else:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
139 print "Match for " + o
1
8045db05180b Initial revision
darius
parents:
diff changeset
140 for i in output[o]:
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
141 if (mailsend):
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
142 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
143 else:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
144 print "%s <-> %s costs $%d" % (i[0], i[1], i[2])
1
8045db05180b Initial revision
darius
parents:
diff changeset
145
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
146 ttimestr = "Note: travel period is from %s to %s" % \
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
147 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
148
1
8045db05180b Initial revision
darius
parents:
diff changeset
149 if (mailsend):
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
150 msg = msg + "\r\n" + ttimestr + "\r\n"
1
8045db05180b Initial revision
darius
parents:
diff changeset
151 server.sendmail(mailfrom, o, msg)
8045db05180b Initial revision
darius
parents:
diff changeset
152 else:
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
153 print ttimestr
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
154 print