Mercurial > ~darius > hgwebdir.cgi > scrape-vb
annotate scrape-vb.py @ 5:275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
author | darius |
---|---|
date | Tue, 28 Aug 2007 02:58:50 +0000 |
parents | e3f4ef0b6e39 |
children | 9f3eb9a07966 |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 | |
3 ############################################################################ | |
4 # Screen scraper for Virgin Blue to look for happy hour deals | |
5 # | |
6 # Prints out (or emails, when mail sending is enabled) the deals that | |
7 # match each user's criteria: cities, travel date and maximum cost | |
8 # | |
5
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
9 # $Id: scrape-vb.py,v 1.4 2007/08/28 02:58:50 darius Exp $ |
1 | 10 ############################################################################ |
11 # | |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
13 # | |
14 # Redistribution and use in source and binary forms, with or without | |
15 # modification, are permitted provided that the following conditions | |
16 # are met: | |
17 # 1. Redistributions of source code must retain the above copyright | |
18 # notice, this list of conditions and the following disclaimer. | |
19 # 2. Redistributions in binary form must reproduce the above copyright | |
20 # notice, this list of conditions and the following disclaimer in the | |
21 # documentation and/or other materials provided with the distribution. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
24 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
27 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 # SUCH DAMAGE. | |
34 # | |
35 ############################################################################ | |
36 | |
5
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
37 import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib, ConfigParser |
1 | 38 |
39 parsetitle = re.compile('([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE) | |
40 parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) | |
41 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
42 conf = ConfigParser.ConfigParser() |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
43 conf.add_section('global') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
44 conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
45 conf.set('global', 'vburl', 'http://virginblue.com.au') |
5
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
46 |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
47 conflist = ['scrape-vb.ini'] |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
48 if ('HOME' in os.environ): |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
49 conflist.append(os.path.expanduser('~/.scrape-vb.ini')) |
275603a8e2ae
Read config file from $HOME as well as CWD (use dotfile for $HOME)
darius
parents:
4
diff
changeset
|
50 conf.read(conflist) |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
51 |
1 | 52 try: |
53 #f = open("vb-happyhour.html") | |
4 | 54 f = urllib.urlopen(conf.get('global', 'vburl')) |
1 | 55 except IOError, e: |
56 print "Unable to fetch page - " + str(e) | |
57 sys.exit(1) | |
58 | |
59 s = BeautifulSoup.BeautifulSoup(f) | |
60 hrr = s.find("ul", "happyhr-rows") | |
61 if (hrr == None): | |
62 print "No happy hour details found" | |
63 sys.exit(0) | |
64 | |
65 hrlist = hrr.findAll("li") | |
66 | |
67 # XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it | |
68 # doesn't work | |
69 times = parsetper.match(s.findAll('ul')[11].find('li').string) | |
70 if (times == None): | |
71 print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li')) | |
72 sys.exit(0) | |
73 | |
74 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) | |
75 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3]) | |
76 | |
77 output = {} | |
78 for i in hrlist: | |
79 href = i.find('a') | |
80 match = parsetitle.match(href['title']) | |
81 if (match == None): | |
82 print "Unable to match " + str(s) | |
83 continue | |
84 | |
85 city1 = match.group(1) | |
86 city2 = match.group(2) | |
87 cost = int(match.group(3)) | |
88 url = href['href'] | |
89 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
90 for email in conf.sections(): |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
91 if (email == 'global'): |
1 | 92 continue |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
93 |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
94 t = {'email' : email} |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
95 for i in conf.items(email): |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
96 t[i[0]] = i[1] |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
97 |
1 | 98 citymatch = True |
99 if ('city1' in t and 'city2' in t): | |
100 if((t['city1'] != city1 or t['city2'] != city2) and | |
101 (t['city1'] != city2 or t['city2'] != city1)): | |
102 citymatch = False | |
103 elif ('city1' in t): | |
104 if (t['city1'] != city1 and t['city1'] != city2): | |
105 citymatch = False | |
106 | |
107 datematch = True | |
108 if ('when' in t): | |
109 travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3]) | |
110 if (travtime < frtime or travtime > totime): | |
111 datematch = False | |
112 | |
113 costmatch = True | |
114 if ('maxcost' in t): | |
115 if (cost > int(t['maxcost'])): | |
116 costmatch = False | |
117 | |
118 if (citymatch and datematch and costmatch): | |
119 if (t['email'] not in output): | |
120 | |
121 output[t['email']] = [] | |
122 output[t['email']].append([city1, city2, cost, url]) | |
123 | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
124 try: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
125 mailsubj = conf.get('global', 'mailsubj') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
126 mailhost = conf.get('global', 'mailhost') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
127 mailsend = conf.getboolean('global', 'mailsend') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
128 mailfrom = conf.get('global', 'mailfrom') |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
129 except ConfigParser.NoOptionError: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
130 mailsend = False |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
131 |
1 | 132 if (mailsend): |
133 server = smtplib.SMTP(mailhost) | |
134 #server.set_debuglevel(1) | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
135 else: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
136 print "Note: Mail sending disabled" |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
137 |
1 | 138 for o in output: |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
139 if (mailsend): |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
140 msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj)) |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
141 msg = msg + "Your criteria for flights have been matched\r\n\r\n" |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
142 else: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
143 print "Match for " + o |
1 | 144 for i in output[o]: |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
145 if (mailsend): |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
146 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]) |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
147 else: |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
148 print "%s <-> %s costs $%d" % (i[0], i[1], i[2]) |
1 | 149 |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
150 ttimestr = "Note: travel period is from %s to %s" % \ |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
151 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")) |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
152 |
1 | 153 if (mailsend): |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
154 msg = msg + "\r\n" + ttimestr + "\r\n" |
1 | 155 server.sendmail(mailfrom, o, msg) |
156 else: | |
3
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
157 print ttimestr |
89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
darius
parents:
1
diff
changeset
|
158 print |