Mercurial > ~darius > hgwebdir.cgi > scrape-vb
annotate scrape-vb.py @ 4:e3f4ef0b6e39
Oops, read URL from configuration like I planned.
author | darius |
---|---|
date | Mon, 27 Aug 2007 02:29:27 +0000 |
parents | 89232ea0c3d4 |
children | 275603a8e2ae |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 | |
3 ############################################################################ | |
4 # Screen scraper for Virgin Blue to look for happy hour deals | |
5 # | |
6 # Prints out (and emails) when criteria match based on cost, | |
7 # destination, etc | |
8 # | |
4 | 9 # $Id: scrape-vb.py,v 1.3 2007/08/27 02:29:27 darius Exp $ |
1 | 10 ############################################################################ |
11 # | |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
13 # | |
14 # Redistribution and use in source and binary forms, with or without | |
15 # modification, are permitted provided that the following conditions | |
16 # are met: | |
17 # 1. Redistributions of source code must retain the above copyright | |
18 # notice, this list of conditions and the following disclaimer. | |
19 # 2. Redistributions in binary form must reproduce the above copyright | |
20 # notice, this list of conditions and the following disclaimer in the | |
21 # documentation and/or other materials provided with the distribution. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 # SUCH DAMAGE. | |
34 # | |
35 ############################################################################ | |
36 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
37 import re, BeautifulSoup, datetime, time, smtplib, sys, urllib, ConfigParser |
1 | 38 |
# Fare link titles look like "<city> - <city> $<whole dollars>"; the groups
# are (first city, second city, price digits).
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
# The conditions text carries "Travel Period: <date> - <date>"; the groups
# are the start and end dates as slash-separated digit strings.
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)
41 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
# Build the configuration: seed the [global] section with built-in
# defaults first, then load scrape-vb.ini (looked up relative to the
# current directory) on top of them.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for option, default in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                        ('vburl', 'http://virginblue.com.au')):
    conf.set('global', option, default)
conf.read('scrape-vb.ini')
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
47 |
# Open the deals page at the URL stored as 'vburl' in the [global]
# section.  If the fetch raises IOError, report it and exit with status 1.
try:
    # Presumably for testing against a saved copy of the page:
    #f = open("vb-happyhour.html")
    f = urllib.urlopen(conf.get('global', 'vburl'))
except IOError as err:
    print("Unable to fetch page - %s" % (err,))
    sys.exit(1)
54 | |
# Parse the fetched page and find the <ul class="happyhr-rows"> list that
# holds one <li> per fare.  No such list means there is nothing on offer,
# which is not an error (exit status 0).
s = BeautifulSoup.BeautifulSoup(f)
hrr = s.find("ul", "happyhr-rows")
if hrr is None:
    print("No happy hour details found")
    sys.exit(0)

hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
# The travel period text is taken from the first <li> of the 12th <ul> on
# the page.  Each lookup is checked so that a changed page layout produces
# the diagnostic below instead of an IndexError/AttributeError/TypeError.
tpertext = None
ullist = s.findAll('ul')
if len(ullist) > 11:
    tperitem = ullist[11].find('li')
    if tperitem is not None:
        # .string is None when the <li> holds more than a single text node
        tpertext = tperitem.string

times = None
if tpertext is not None:
    times = parsetper.match(tpertext)
if times is None:
    # Show the text that failed to parse (repr keeps odd characters
    # printable)
    print("Unable to parse travel period %r" % (tpertext,))
    sys.exit(0)

# Only year/month/day of the parsed struct_time are used, so both bounds
# are midnight datetimes.
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])
72 | |
# Every configuration section other than [global] describes one recipient:
# the section name is the e-mail address and its options are the search
# criteria (city1, city2, when, maxcost - all optional).  These do not
# change between fares, so collect them once up front.
criteria = []
for section in conf.sections():
    if section == 'global':
        continue
    wanted = {'email': section}
    wanted.update(conf.items(section))
    criteria.append(wanted)

# output maps e-mail address -> list of [city1, city2, cost, url] matches
output = {}
for item in hrlist:
    href = item.find('a')
    if href is None:
        # A row without a link carries no fare information
        print("Unable to match " + str(item))
        continue

    title = href.get('title', '')
    match = parsetitle.match(title)
    if match is None:
        # Report the title that failed, not the whole document
        print("Unable to match %r" % (title,))
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']

    for wanted in criteria:
        # City filter: with both cities given the fare must join exactly
        # that pair (either direction); with only city1 given the fare
        # must touch that city.  city2 on its own is not used.
        if 'city1' in wanted:
            if 'city2' in wanted:
                pair = (wanted['city1'], wanted['city2'])
                if pair != (city1, city2) and pair != (city2, city1):
                    continue
            elif wanted['city1'] not in (city1, city2):
                continue

        # Date filter: the requested day must lie inside the travel period
        if 'when' in wanted:
            travtime = datetime.datetime(*time.strptime(wanted['when'], "%d/%m/%y")[0:3])
            if travtime < frtime or travtime > totime:
                continue

        # Cost filter: the fare must not exceed the recipient's maximum
        if 'maxcost' in wanted and cost > int(wanted['maxcost']):
            continue

        output.setdefault(wanted['email'], []).append([city1, city2, cost, url])
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
# Read the mail settings from the [global] section.  If any of these
# options is absent, mail is switched off and matches are only printed.
try:
    mailsubj = conf.get('global', 'mailsubj')
    mailhost = conf.get('global', 'mailhost')
    mailsend = conf.getboolean('global', 'mailsend')
    mailfrom = conf.get('global', 'mailfrom')
except ConfigParser.NoOptionError:
    mailsend = False

# Connect to the SMTP host only when mail is actually going to be sent
if not mailsend:
    print("Note: Mail sending disabled")
else:
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
133 |
# Deliver the results: one e-mail per recipient when mail sending is
# enabled, otherwise the same information printed to standard output.
for rcpt in output:
    deals = output[rcpt]
    ttimestr = "Note: travel period is from %s to %s" % \
        (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))

    if mailsend:
        # Headers, a blank line, then the body; collected as pieces and
        # joined once rather than growing a string step by step.
        parts = ["From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, rcpt, mailsubj),
                 "Your criteria for flights have been matched\r\n\r\n"]
        for deal in deals:
            parts.append("%s <-> %s costs $%d - %s\r\n" % (deal[0], deal[1], deal[2], deal[3]))
        parts.append("\r\n" + ttimestr + "\r\n")
        server.sendmail(mailfrom, rcpt, "".join(parts))
    else:
        print("Match for " + rcpt)
        for deal in deals:
            print("%s <-> %s costs $%d" % (deal[0], deal[1], deal[2]))
        print(ttimestr)
        print("")

# End the SMTP session politely instead of leaving the connection open
# until the interpreter exits.
if mailsend:
    server.quit()