Mercurial > ~darius > hgwebdir.cgi > scrape-vb
annotate scrape-vb.py @ 6:9f3eb9a07966
Add config parser.
Add ability to print out an example config file.
Add -f option to read a saved page.
author | darius |
---|---|
date | Wed, 29 Aug 2007 07:37:59 +0000 |
parents | 275603a8e2ae |
children | bf896507faa9 |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 | |
3 ############################################################################ | |
4 # Screen scraper for Virgin Blue to look for happy hour deals | |
5 # | |
6 # Prints out (and emails) when criteria match based on cost, | |
7 # destination, etc | |
8 # | |
6 | 9 # $Id: scrape-vb.py,v 1.5 2007/08/29 07:37:59 darius Exp $ |
1 | 10 ############################################################################ |
11 # | |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
13 # | |
14 # Redistribution and use in source and binary forms, with or without | |
15 # modification, are permitted provided that the following conditions | |
16 # are met: | |
17 # 1. Redistributions of source code must retain the above copyright | |
18 # notice, this list of conditions and the following disclaimer. | |
19 # 2. Redistributions in binary form must reproduce the above copyright | |
20 # notice, this list of conditions and the following disclaimer in the | |
21 # documentation and/or other materials provided with the distribution. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 # SUCH DAMAGE. | |
34 # | |
35 ############################################################################ | |
36 | |
6 | 37 import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib |
38 import ConfigParser, optparse | |
39 | |
usage = '''%prog [options]
Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini'''

# Command line handling.  The parser object is called "parser" so that the
# imported optparse module is not rebound to an OptionParser instance.
parser = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.5 2007/08/29 07:37:59 darius Exp $")
parser.add_option('-d', '--debug', action="store_true", default=False,
                  help="Disable mail sending, prints mail message to stdout")
parser.add_option('-f', '--file', help="Do not fetch the page, use this file instead")
parser.add_option('-e', '--example', action="store_true", default=False,
                  help="Print an example configuration file to stdout and exit")
(options, args) = parser.parse_args()

# -e/--example: print a template configuration and exit before any
# configuration file is read or any page is fetched.
if options.example:
    print('''[global]
mailsubj="Subject line for emails"
# The following 3 options are necessary before email will be sent
mailfrom=user@host.com
mailsend=True
mailhost=mail.server.com

[user@host.com]
# All fields are optional
city1=Foo
city2=Bar
when=dd/mm/yy
maxcost=123
''')
    sys.exit(0)
67 | |
# Link titles look like "<city> - <city> $<cost>"; the groups are the two
# city names and the whole-dollar fare.  Raw strings keep "\$" a regex escape
# rather than an (unrecognised) string escape.
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
# "Travel Period: <from> - <to>"; the groups are the two slash-separated dates.
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)
70 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
# Start from built-in defaults for the [global] section, then layer the
# configuration files on top of them.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for defkey, defval in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                       ('vburl', 'http://virginblue.com.au')):
    conf.set('global', defkey, defval)

# The file in the current directory is listed first; the dotfile in the
# user's home directory is only added when $HOME is set.
conflist = ['scrape-vb.ini']
if 'HOME' in os.environ:
    conflist = conflist + [os.path.expanduser('~/.scrape-vb.ini')]
conf.read(conflist)
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
80 |
# Obtain the page to scrape: the saved copy named by -f/--file when given,
# otherwise the live page at the configured URL.
try:
    if options.file is None:
        f = urllib.urlopen(conf.get('global', 'vburl'))
    else:
        f = open(options.file)
except IOError:
    # Fetch the active exception explicitly instead of binding it in the
    # except clause.
    e = sys.exc_info()[1]
    print("Unable to fetch page - " + str(e))
    sys.exit(1)
89 | |
# Parse the page and locate the list of happy hour fares.
s = BeautifulSoup.BeautifulSoup(f)
hrr = s.find("ul", "happyhr-rows")
if hrr is None:
    print("No happy hour details found")
    sys.exit(0)

hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
# The travel period is taken from the first <li> of the 12th <ul> on the
# page.  Each step is checked so that a changed page layout ends up in the
# "Unable to parse" message below rather than in an exception.
ullist = s.findAll('ul')
tperitem = None
if len(ullist) > 11:
    tperitem = ullist[11].find('li')

tpertext = None
if tperitem is not None:
    tpertext = tperitem.string

times = None
if tpertext is not None:
    times = parsetper.match(tpertext)

if times is None:
    # Show the element that failed to parse (or "None" when it is missing).
    print("Unable to parse travel period " + str(tperitem))
    sys.exit(0)

# Both dates are day/month/two-digit-year; keep only (year, month, day).
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])
107 | |
# Maps a recipient (configuration section name, normally an email address)
# to the list of [city1, city2, cost, url] fares that met their criteria.
output = {}
for item in hrlist:
    href = item.find('a')
    if href is None:
        # A row without a link has no title to parse and no URL to report.
        continue

    title = href['title']
    match = parsetitle.match(title)
    if match is None:
        # Report the title that failed, not the whole page.
        print("Unable to match " + title)
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']

    # Every section other than [global] describes one recipient's criteria.
    for email in conf.sections():
        if email == 'global':
            continue
        # Stuff configuration into a dictionary for our convenience
        t = {'email': email}
        t.update(conf.items(email))

        citymatch = True
        if 'city1' in t and 'city2' in t:
            # Both ends given: accept the route in either direction.
            wanted = (t['city1'], t['city2'])
            if wanted != (city1, city2) and wanted != (city2, city1):
                citymatch = False
        elif 'city1' in t:
            # One city given: it may be either end of the route.
            if t['city1'] not in (city1, city2):
                citymatch = False

        datematch = True
        if 'when' in t:
            # The requested date must fall inside the travel period.
            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
            if travtime < frtime or travtime > totime:
                datematch = False

        costmatch = True
        if 'maxcost' in t:
            if cost > int(t['maxcost']):
                costmatch = False

        if citymatch and datematch and costmatch:
            output.setdefault(t['email'], []).append([city1, city2, cost, url])
154 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
# Mail is only sent when all of these options can be read from [global];
# a missing option turns mail sending off.
try:
    mailsubj = conf.get('global', 'mailsubj')
    mailhost = conf.get('global', 'mailhost')
    mailsend = conf.getboolean('global', 'mailsend')
    mailfrom = conf.get('global', 'mailfrom')
except ConfigParser.NoOptionError:
    mailsend = False

# -d/--debug wins over the configuration file.
if mailsend and options.debug:
    print("mailsend overridden due to debugging")
    mailsend = False

if not mailsend:
    print("Note: Mail sending disabled")
else:
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
172 |
# Report the matches for each recipient: one mail per recipient when mail
# sending is enabled, otherwise a plain listing on stdout.
try:
    for o in output:
        ttimestr = "Note: travel period is from %s to %s" % \
            (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))

        if mailsend:
            # Headers, a blank line, then one line per matching fare.
            parts = ["From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj),
                     "Your criteria for flights have been matched\r\n\r\n"]
            for fare in output[o]:
                parts.append("%s <-> %s costs $%d - %s\r\n" % (fare[0], fare[1], fare[2], fare[3]))
            parts.append("\r\n" + ttimestr + "\r\n")
            server.sendmail(mailfrom, o, "".join(parts))
        else:
            print("Match for " + o)
            for fare in output[o]:
                print("%s <-> %s costs $%d" % (fare[0], fare[1], fare[2]))
            print(ttimestr)
            print("")
finally:
    # Close the SMTP session even if sending to one recipient failed.
    if mailsend:
        server.quit()