annotate scrape-vb.py @ 6:9f3eb9a07966

Add config parser. Add ability to print out an example config file. Add -f option to read a saved page.
author darius
date Wed, 29 Aug 2007 07:37:59 +0000
parents 275603a8e2ae
children bf896507faa9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
8045db05180b Initial revision
darius
parents:
diff changeset
1 #!/usr/bin/env python
8045db05180b Initial revision
darius
parents:
diff changeset
2
8045db05180b Initial revision
darius
parents:
diff changeset
3 ############################################################################
8045db05180b Initial revision
darius
parents:
diff changeset
4 # Screen scraper for Virgin Blue to look for happy hour deals
8045db05180b Initial revision
darius
parents:
diff changeset
5 #
8045db05180b Initial revision
darius
parents:
diff changeset
6 # Prints out (and emails) when criteria match based on cost,
8045db05180b Initial revision
darius
parents:
diff changeset
7 # destination, etc
8045db05180b Initial revision
darius
parents:
diff changeset
8 #
6
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
9 # $Id: scrape-vb.py,v 1.5 2007/08/29 07:37:59 darius Exp $
1
8045db05180b Initial revision
darius
parents:
diff changeset
10 ############################################################################
8045db05180b Initial revision
darius
parents:
diff changeset
11 #
8045db05180b Initial revision
darius
parents:
diff changeset
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved.
8045db05180b Initial revision
darius
parents:
diff changeset
13 #
8045db05180b Initial revision
darius
parents:
diff changeset
14 # Redistribution and use in source and binary forms, with or without
8045db05180b Initial revision
darius
parents:
diff changeset
15 # modification, are permitted provided that the following conditions
8045db05180b Initial revision
darius
parents:
diff changeset
16 # are met:
8045db05180b Initial revision
darius
parents:
diff changeset
17 # 1. Redistributions of source code must retain the above copyright
8045db05180b Initial revision
darius
parents:
diff changeset
18 # notice, this list of conditions and the following disclaimer.
8045db05180b Initial revision
darius
parents:
diff changeset
19 # 2. Redistributions in binary form must reproduce the above copyright
8045db05180b Initial revision
darius
parents:
diff changeset
20 # notice, this list of conditions and the following disclaimer in the
8045db05180b Initial revision
darius
parents:
diff changeset
21 # documentation and/or other materials provided with the distribution.
8045db05180b Initial revision
darius
parents:
diff changeset
22 #
8045db05180b Initial revision
darius
parents:
diff changeset
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
8045db05180b Initial revision
darius
parents:
diff changeset
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
8045db05180b Initial revision
darius
parents:
diff changeset
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
8045db05180b Initial revision
darius
parents:
diff changeset
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
8045db05180b Initial revision
darius
parents:
diff changeset
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
8045db05180b Initial revision
darius
parents:
diff changeset
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
8045db05180b Initial revision
darius
parents:
diff changeset
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
8045db05180b Initial revision
darius
parents:
diff changeset
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
8045db05180b Initial revision
darius
parents:
diff changeset
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
8045db05180b Initial revision
darius
parents:
diff changeset
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
8045db05180b Initial revision
darius
parents:
diff changeset
33 # SUCH DAMAGE.
8045db05180b Initial revision
darius
parents:
diff changeset
34 #
8045db05180b Initial revision
darius
parents:
diff changeset
35 ############################################################################
8045db05180b Initial revision
darius
parents:
diff changeset
36
6
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
37 import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
38 import ConfigParser, optparse
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
39
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
# Command line handling.  The usage text is shown by -h/--help.
usage = '''%prog [options]
Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini'''

# Bind the parser to its own name so the optparse module is not shadowed
# by an OptionParser instance.
parser = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.5 2007/08/29 07:37:59 darius Exp $")
parser.add_option('-d', '--debug', action="store_true", default=False,
                  help="Disable mail sending, prints mail message to stdout")
parser.add_option('-f', '--file', help="Do not fetch the page, use this file instead")
parser.add_option('-e', '--example', action="store_true", default=False,
                  help="Print an example configuration file to stdout and exit")
# options carries .debug, .file and .example; positional args are unused.
(options, args) = parser.parse_args()
1
8045db05180b Initial revision
darius
parents:
diff changeset
50
6
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
51 if (options.example):
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
52 print '''[global]
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
53 mailsubj="Subject line for emails"
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
54 # The following 3 options are necessary before email will be sent
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
55 mailfrom=user@host.com
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
56 mailsend=True
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
57 mailhost=mail.server.com
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
58
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
59 [user@host.com]
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
60 # All fields are optional
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
61 city1=Foo
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
62 city2=Bar
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
63 when=dd/mm/yy
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
64 maxcost=123
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
65 '''
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
66 sys.exit(0)
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
67
1
8045db05180b Initial revision
darius
parents:
diff changeset
# Deal titles look like "<city> - <city> $<whole dollars>".
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.I)

# The travel period line carries two slash-separated numeric dates.
_tperdate = r'[0-9]+/[0-9]+/[0-9]+'
parsetper = re.compile('Travel Period: (' + _tperdate + ') - (' + _tperdate + ')', re.I)
8045db05180b Initial revision
darius
parents:
diff changeset
70
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
# Start from built-in defaults in [global]; values read from the ini
# files below replace them.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for key, value in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                   ('vburl', 'http://virginblue.com.au')):
    conf.set('global', key, value)

# Candidate config files: one in the current directory and, when $HOME
# is set, a dotfile in the home directory.
conflist = ['scrape-vb.ini']
if os.environ.get('HOME') is not None:
    conflist.append(os.path.expanduser('~/.scrape-vb.ini'))
conf.read(conflist)
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
80
1
8045db05180b Initial revision
darius
parents:
diff changeset
81 try:
6
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
82 if (options.file != None):
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
83 f = open(options.file)
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
84 else:
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
85 f = urllib.urlopen(conf.get('global', 'vburl'))
1
8045db05180b Initial revision
darius
parents:
diff changeset
86 except IOError, e:
8045db05180b Initial revision
darius
parents:
diff changeset
87 print "Unable to fetch page - " + str(e)
8045db05180b Initial revision
darius
parents:
diff changeset
88 sys.exit(1)
8045db05180b Initial revision
darius
parents:
diff changeset
89
8045db05180b Initial revision
darius
parents:
diff changeset
90 s = BeautifulSoup.BeautifulSoup(f)
8045db05180b Initial revision
darius
parents:
diff changeset
91 hrr = s.find("ul", "happyhr-rows")
8045db05180b Initial revision
darius
parents:
diff changeset
92 if (hrr == None):
8045db05180b Initial revision
darius
parents:
diff changeset
93 print "No happy hour details found"
8045db05180b Initial revision
darius
parents:
diff changeset
94 sys.exit(0)
8045db05180b Initial revision
darius
parents:
diff changeset
95
8045db05180b Initial revision
darius
parents:
diff changeset
96 hrlist = hrr.findAll("li")
8045db05180b Initial revision
darius
parents:
diff changeset
97
8045db05180b Initial revision
darius
parents:
diff changeset
98 # XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
8045db05180b Initial revision
darius
parents:
diff changeset
99 # doesn't work
8045db05180b Initial revision
darius
parents:
diff changeset
100 times = parsetper.match(s.findAll('ul')[11].find('li').string)
8045db05180b Initial revision
darius
parents:
diff changeset
101 if (times == None):
8045db05180b Initial revision
darius
parents:
diff changeset
102 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li'))
8045db05180b Initial revision
darius
parents:
diff changeset
103 sys.exit(0)
8045db05180b Initial revision
darius
parents:
diff changeset
104
8045db05180b Initial revision
darius
parents:
diff changeset
105 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
8045db05180b Initial revision
darius
parents:
diff changeset
106 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])
8045db05180b Initial revision
darius
parents:
diff changeset
107
8045db05180b Initial revision
darius
parents:
diff changeset
108 output = {}
8045db05180b Initial revision
darius
parents:
diff changeset
109 for i in hrlist:
8045db05180b Initial revision
darius
parents:
diff changeset
110 href = i.find('a')
8045db05180b Initial revision
darius
parents:
diff changeset
111 match = parsetitle.match(href['title'])
8045db05180b Initial revision
darius
parents:
diff changeset
112 if (match == None):
8045db05180b Initial revision
darius
parents:
diff changeset
113 print "Unable to match " + str(s)
8045db05180b Initial revision
darius
parents:
diff changeset
114 continue
8045db05180b Initial revision
darius
parents:
diff changeset
115
8045db05180b Initial revision
darius
parents:
diff changeset
116 city1 = match.group(1)
8045db05180b Initial revision
darius
parents:
diff changeset
117 city2 = match.group(2)
8045db05180b Initial revision
darius
parents:
diff changeset
118 cost = int(match.group(3))
8045db05180b Initial revision
darius
parents:
diff changeset
119 url = href['href']
8045db05180b Initial revision
darius
parents:
diff changeset
120
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
121 for email in conf.sections():
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
122 if (email == 'global'):
1
8045db05180b Initial revision
darius
parents:
diff changeset
123 continue
6
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
124 # Stuff configuration into a dictionary for our convenience
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
125 t = {'email' : email}
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
126 for i in conf.items(email):
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
127 t[i[0]] = i[1]
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
128
1
8045db05180b Initial revision
darius
parents:
diff changeset
129 citymatch = True
8045db05180b Initial revision
darius
parents:
diff changeset
130 if ('city1' in t and 'city2' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
131 if((t['city1'] != city1 or t['city2'] != city2) and
8045db05180b Initial revision
darius
parents:
diff changeset
132 (t['city1'] != city2 or t['city2'] != city1)):
8045db05180b Initial revision
darius
parents:
diff changeset
133 citymatch = False
8045db05180b Initial revision
darius
parents:
diff changeset
134 elif ('city1' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
135 if (t['city1'] != city1 and t['city1'] != city2):
8045db05180b Initial revision
darius
parents:
diff changeset
136 citymatch = False
8045db05180b Initial revision
darius
parents:
diff changeset
137
8045db05180b Initial revision
darius
parents:
diff changeset
138 datematch = True
8045db05180b Initial revision
darius
parents:
diff changeset
139 if ('when' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
140 travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
8045db05180b Initial revision
darius
parents:
diff changeset
141 if (travtime < frtime or travtime > totime):
8045db05180b Initial revision
darius
parents:
diff changeset
142 datematch = False
8045db05180b Initial revision
darius
parents:
diff changeset
143
8045db05180b Initial revision
darius
parents:
diff changeset
144 costmatch = True
8045db05180b Initial revision
darius
parents:
diff changeset
145 if ('maxcost' in t):
8045db05180b Initial revision
darius
parents:
diff changeset
146 if (cost > int(t['maxcost'])):
8045db05180b Initial revision
darius
parents:
diff changeset
147 costmatch = False
8045db05180b Initial revision
darius
parents:
diff changeset
148
8045db05180b Initial revision
darius
parents:
diff changeset
149 if (citymatch and datematch and costmatch):
8045db05180b Initial revision
darius
parents:
diff changeset
150 if (t['email'] not in output):
8045db05180b Initial revision
darius
parents:
diff changeset
151
8045db05180b Initial revision
darius
parents:
diff changeset
152 output[t['email']] = []
8045db05180b Initial revision
darius
parents:
diff changeset
153 output[t['email']].append([city1, city2, cost, url])
8045db05180b Initial revision
darius
parents:
diff changeset
154
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
155 try:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
156 mailsubj = conf.get('global', 'mailsubj')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
157 mailhost = conf.get('global', 'mailhost')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
158 mailsend = conf.getboolean('global', 'mailsend')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
159 mailfrom = conf.get('global', 'mailfrom')
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
160 except ConfigParser.NoOptionError:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
161 mailsend = False
6
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
162
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
163 if (options.debug == True and mailsend):
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
164 print "mailsend overridden due to debugging"
9f3eb9a07966 Add config parser.
darius
parents: 5
diff changeset
165 mailsend = False
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
166
1
8045db05180b Initial revision
darius
parents:
diff changeset
167 if (mailsend):
8045db05180b Initial revision
darius
parents:
diff changeset
168 server = smtplib.SMTP(mailhost)
8045db05180b Initial revision
darius
parents:
diff changeset
169 #server.set_debuglevel(1)
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
170 else:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
171 print "Note: Mail sending disabled"
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
172
1
8045db05180b Initial revision
darius
parents:
diff changeset
173 for o in output:
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
174 if (mailsend):
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
175 msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj))
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
176 msg = msg + "Your criteria for flights have been matched\r\n\r\n"
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
177 else:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
178 print "Match for " + o
1
8045db05180b Initial revision
darius
parents:
diff changeset
179 for i in output[o]:
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
180 if (mailsend):
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
181 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
182 else:
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
183 print "%s <-> %s costs $%d" % (i[0], i[1], i[2])
1
8045db05180b Initial revision
darius
parents:
diff changeset
184
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
185 ttimestr = "Note: travel period is from %s to %s" % \
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
186 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
187
1
8045db05180b Initial revision
darius
parents:
diff changeset
188 if (mailsend):
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
189 msg = msg + "\r\n" + ttimestr + "\r\n"
1
8045db05180b Initial revision
darius
parents:
diff changeset
190 server.sendmail(mailfrom, o, msg)
8045db05180b Initial revision
darius
parents:
diff changeset
191 else:
3
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
192 print ttimestr
89232ea0c3d4 Read configuration from an ini file rather than hard coding it in the
darius
parents: 1
diff changeset
193 print