diff scrape-vb.py @ 1:8045db05180b SCRAPEVB_1_0

Initial revision
author darius
date Sat, 25 Aug 2007 05:17:29 +0000 (2007-08-25)
parents
children 89232ea0c3d4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scrape-vb.py	Sat Aug 25 05:17:29 2007 +0000
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+############################################################################
+# Screen scraper for Virgin Blue to look for happy hour deals
+#
+# Prints out (and emails) when criteria match based on cost,
+# destination, etc
+#
+# $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $
+############################################################################
+#
+# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+############################################################################
+
+import re, BeautifulSoup, datetime, time, smtplib, sys, urllib
+
+#### Configuration
+
+### Travel criteria
+# Supported keys are email, when, city1, city2, maxcost
+# email is mandatory. If city2 is not present either city will be
+# matched. when (dd/mm/yy) and maxcost are optional (will match for
+# any date or cost)
+travellers = [
+    { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' },
+    { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' },
+  ]
+
+### Mail host
+mailhost = 'mail.dons.net.au'
+
+### Who the email is from
+mailfrom = 'darius@dons.net.au'
+
+### What's on the subject line
+mailsubj = 'Virgin Blue Happy Hour Deals'
+
+### Actually send email?
+mailsend = False
+
+### URL to parse
+vburl = 'http://virginblue.com.au'
+
+parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)  # "City - City $NN"
+parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)
+
+try:
+    # Fetch the live page (for offline testing use: f = open("vb-happyhour.html"))
+    f = urllib.urlopen(vburl)
+except IOError, e:
+    print "Unable to fetch page - " + str(e)
+    sys.exit(1)
+# Parse the page and locate the "happyhr-rows" <ul>; without it there is nothing to report
+s = BeautifulSoup.BeautifulSoup(f)
+hrr = s.find("ul", "happyhr-rows")
+if hrr is None:
+    print "No happy hour details found"
+    sys.exit(0)
+# Every <li> under that list is examined as a candidate deal below
+hrlist = hrr.findAll("li")
+
+# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
+# doesn't work, so take the first <li> of the 12th <ul> on the page
+times = parsetper.match(s.findAll('ul')[11].find('li').string)
+if times is None:
+    print "Unable to parse travel period " + str(s.findAll('ul')[11].find('li').string)
+    sys.exit(0)
+# Convert both dd/mm/yy dates to datetimes (midnight) so they can be range-compared
+frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
+totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])
+
+#print "Travel from %s to %s" % (str(frtime), str(totime))
+
+# Deals that matched, keyed by traveller email: lists of [city1, city2, cost, url]
+output = {}
+for i in hrlist:
+    href = i.find('a')
+    match = parsetitle.match(href['title'])
+    if match is None:
+        print "Unable to match " + repr(href['title'])
+        continue
+    # Title parsed as "<city1> - <city2> $<cost>"
+    city1 = match.group(1)
+    city2 = match.group(2)
+    cost = int(match.group(3))
+    url = href['href']
+    # Test this deal against every traveller's criteria
+    for t in travellers:
+        if ('email' not in t):
+            print "No email key found, configuration error?"
+            continue
+        # Both cities given: match the pair in either direction; only city1: match either end
+        citymatch = True
+        if ('city1' in t and 'city2' in t):
+            if((t['city1'] != city1 or t['city2'] != city2) and
+               (t['city1'] != city2 or t['city2'] != city1)):
+                   citymatch = False
+        elif ('city1' in t):
+            if (t['city1'] != city1 and t['city1'] != city2):
+                citymatch = False
+        # Optional 'when' (dd/mm/yy) must fall inside the advertised travel period
+        datematch = True
+        if ('when' in t):
+            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
+            if (travtime < frtime or travtime > totime):
+                datematch = False
+        # Optional 'maxcost' is an inclusive upper bound on the fare
+        costmatch = True
+        if ('maxcost' in t):
+            if (cost > int(t['maxcost'])):
+                costmatch = False
+        # Record the deal only when every supplied criterion matched
+        if (citymatch and datematch and costmatch):
+            if (t['email'] not in output):
+                output[t['email']] = []
+            output[t['email']].append([city1, city2, cost, url])
+
+# Mail (or just print, when mailsend is False) each traveller their matched deals
+if (mailsend):
+    server = smtplib.SMTP(mailhost)
+for o in output:
+    msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj))
+    msg = msg + "Your criteria for flights have been matched\r\n\r\n"
+    print "Sending email to " + o
+    for i in output[o]:
+        print "%s <-> %s costs $%d" % (i[0], i[1], i[2])
+        msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])
+    msg = msg + "\r\nNote: travel period is from %s to %s" % \
+                 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))
+    if (mailsend):
+        server.sendmail(mailfrom, o, msg)
+    else:
+        print msg
+    print
+if (mailsend):
+    server.quit()  # end the SMTP session cleanly instead of just dropping the connection