Mercurial > ~darius > hgwebdir.cgi > scrape-vb
comparison scrape-vb.py @ 3:89232ea0c3d4
Read configuration from an ini file rather than hard coding it in the
source.
author | darius |
---|---|
date | Mon, 27 Aug 2007 01:42:11 +0000 |
parents | 8045db05180b |
children | e3f4ef0b6e39 |
comparison
equal
deleted
inserted
replaced
2:a28ea6e01650 | 3:89232ea0c3d4 |
---|---|
4 # Screen scraper for Virgin Blue to look for happy hour deals | 4 # Screen scraper for Virgin Blue to look for happy hour deals |
5 # | 5 # |
6 # Prints out (and emails) when criteria match based on cost, | 6 # Prints out (and emails) when criteria match based on cost, |
7 # destination, etc | 7 # destination, etc |
8 # | 8 # |
9 # $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $ | 9 # $Id: scrape-vb.py,v 1.2 2007/08/27 01:42:11 darius Exp $ |
10 ############################################################################ | 10 ############################################################################ |
11 # | 11 # |
12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | 12 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. |
13 # | 13 # |
14 # Redistribution and use in source and binary forms, with or without | 14 # Redistribution and use in source and binary forms, with or without |
32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 32 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
33 # SUCH DAMAGE. | 33 # SUCH DAMAGE. |
34 # | 34 # |
35 ############################################################################ | 35 ############################################################################ |
36 | 36 |
37 import re, BeautifulSoup, datetime, time, smtplib, sys, urllib | 37 import re, BeautifulSoup, datetime, time, smtplib, sys, urllib, ConfigParser |
38 | |
39 #### Configuration | |
40 | |
41 ### Travel criteria | |
42 # Supported keys are email, when, city1, city2, maxcost | |
43 # email is mandatory. If city2 is not present either city will be | |
44 # matched. when and maxcost are optional (will match for any date or | |
45 # cost) | |
46 travellers = [ | |
47 { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' }, | |
48 { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' }, | |
49 ] | |
50 | |
51 ### Mail host | |
52 mailhost = 'mail.dons.net.au' | |
53 | |
54 ### Who the email is from | |
55 mailfrom = 'darius@dons.net.au' | |
56 | |
57 ### What's on the subject line | |
58 mailsubj = 'Virgin Blue Happy Hour Deals' | |
59 | |
60 ### Actually send email? | |
61 mailsend = False | |
62 | |
63 ### URL to parse | |
64 vburl = 'http://virginblue.com.au' | |
65 | 38 |
66 parsetitle = re.compile('([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE) | 39 parsetitle = re.compile('([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE) |
67 parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) | 40 parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE) |
41 | |
42 conf = ConfigParser.ConfigParser() | |
43 conf.add_section('global') | |
44 conf.set('global', 'mailsubj', 'Virgin Blue Happy Hour Deals') | |
45 conf.set('global', 'vburl', 'http://virginblue.com.au') | |
46 conf.read('scrape-vb.ini') | |
68 | 47 |
69 try: | 48 try: |
70 #f = open("vb-happyhour.html") | 49 #f = open("vb-happyhour.html") |
71 f = urllib.urlopen(vburl) | 50 f = urllib.urlopen(vburl) |
72 except IOError, e: | 51 except IOError, e: |
89 sys.exit(0) | 68 sys.exit(0) |
90 | 69 |
91 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) | 70 frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3]) |
92 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3]) | 71 totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3]) |
93 | 72 |
94 #print "Travel from %s to %s" % (str(frtime), str(totime)) | |
95 | |
96 output = {} | 73 output = {} |
97 for i in hrlist: | 74 for i in hrlist: |
98 href = i.find('a') | 75 href = i.find('a') |
99 match = parsetitle.match(href['title']) | 76 match = parsetitle.match(href['title']) |
100 if (match == None): | 77 if (match == None): |
104 city1 = match.group(1) | 81 city1 = match.group(1) |
105 city2 = match.group(2) | 82 city2 = match.group(2) |
106 cost = int(match.group(3)) | 83 cost = int(match.group(3)) |
107 url = href['href'] | 84 url = href['href'] |
108 | 85 |
109 for t in travellers: | 86 for email in conf.sections(): |
110 if ('email' not in t): | 87 if (email == 'global'): |
111 print "No email key found, configuration error?" | |
112 continue | 88 continue |
113 | 89 |
90 t = {'email' : email} | |
91 for i in conf.items(email): | |
92 t[i[0]] = i[1] | |
93 | |
114 citymatch = True | 94 citymatch = True |
115 if ('city1' in t and 'city2' in t): | 95 if ('city1' in t and 'city2' in t): |
116 if((t['city1'] != city1 or t['city2'] != city2) and | 96 if((t['city1'] != city1 or t['city2'] != city2) and |
117 (t['city1'] != city2 or t['city2'] != city1)): | 97 (t['city1'] != city2 or t['city2'] != city1)): |
118 citymatch = False | 98 citymatch = False |
135 if (t['email'] not in output): | 115 if (t['email'] not in output): |
136 | 116 |
137 output[t['email']] = [] | 117 output[t['email']] = [] |
138 output[t['email']].append([city1, city2, cost, url]) | 118 output[t['email']].append([city1, city2, cost, url]) |
139 | 119 |
120 try: | |
121 mailsubj = conf.get('global', 'mailsubj') | |
122 mailhost = conf.get('global', 'mailhost') | |
123 mailsend = conf.getboolean('global', 'mailsend') | |
124 mailfrom = conf.get('global', 'mailfrom') | |
125 except ConfigParser.NoOptionError: | |
126 mailsend = False | |
127 | |
140 if (mailsend): | 128 if (mailsend): |
141 server = smtplib.SMTP(mailhost) | 129 server = smtplib.SMTP(mailhost) |
142 #server.set_debuglevel(1) | 130 #server.set_debuglevel(1) |
131 else: | |
132 print "Note: Mail sending disabled" | |
133 | |
134 for o in output: | |
135 if (mailsend): | |
136 msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj)) | |
137 msg = msg + "Your criteria for flights have been matched\r\n\r\n" | |
138 else: | |
139 print "Match for " + o | |
140 for i in output[o]: | |
141 if (mailsend): | |
142 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]) | |
143 else: | |
144 print "%s <-> %s costs $%d" % (i[0], i[1], i[2]) | |
143 | 145 |
144 for o in output: | 146 ttimestr = "Note: travel period is from %s to %s" % \ |
145 msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj)) | 147 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")) |
146 msg = msg + "Your criteria for flights have been matched\r\n\r\n" | |
147 print "Sending email to " + o | |
148 for i in output[o]: | |
149 print "%s <-> %s costs $%d" % (i[0], i[1], i[2]) | |
150 msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]) | |
151 | 148 |
152 msg = msg + "\r\nNote: travel period is from %s to %s" % \ | |
153 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y")) | |
154 if (mailsend): | 149 if (mailsend): |
150 msg = msg + "\r\n" + ttimestr + "\r\n" | |
155 server.sendmail(mailfrom, o, msg) | 151 server.sendmail(mailfrom, o, msg) |
156 else: | 152 else: |
157 print msg | 153 print ttimestr |
158 print | 154 print |