Mercurial > ~darius > hgwebdir.cgi > scrape-gm
view scrape-gm.py @ 13:e8550290e512
Update parser for new format.
author | darius@Inchoate |
---|---|
date | Sun, 14 Dec 2008 18:48:15 +1030 |
parents | ae9e833e4447 |
children |
line wrap: on
line source
#!/usr/bin/env python
############################################################################
# Screen scraper for game-monitor.com
#
# Prints out matched player names aggregated by server
#
############################################################################
#
# Copyright (C) 2008 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import re, time, datetime, urllib, sys, BeautifulSoup

# Set to True to print the raw column lists and skipped-row diagnostics.
debug = False


class Server:
    """One game server from a game-monitor.com player search.

    Holds the server's description, address, map, game type, player counts
    and update age, plus the list of matched player names found on it.
    """

    # Any remaining markup; stripped last in FixTags.
    alltags = re.compile('<[^>]*>')
    # Vertical whitespace tags; FixTags turns them into newlines.
    vwhttags = re.compile('<(br|hr)>')
    # Horizontal whitespace; FixTags deletes every match.
    # NOTE(review): as written this matches a single space character, which
    # would also remove the leading spaces that `sdesc` requires.  The
    # pattern may originally have been an HTML entity (e.g. a non-breaking
    # space) that was lost when the file was rendered to text -- confirm
    # against the repository copy.
    hwhttags = re.compile(r'\ ')
    # Game type column: the type is the first path element of the link.
    typetag = re.compile('<td><a href="/GameSearch/([^/]+)/.*</td>')

    # "<players>/<max players>"
    pcountre = re.compile(r"([0-9]+)/([0-9]+)")
    # "<dotted quad>:<port>"
    ipportre = re.compile(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)")
    # "<spaces><number>.<spaces><description>"
    sdesc = re.compile(r" +[0-9]+\. +(.*)")

    def __init__(self, pcols, scols):
        """Build a server record from one cleaned player row and server row.

        pcols -- cleaned cells of the player-table row:
                 pcols[1] = Player name
                 pcols[2] = Server description
        scols -- cleaned cells of the matching server-table row:
                 scols[0] = Players in server / max players
                 scols[1] = Server IP & port
                 scols[2] = Map name
                 scols[3] = Game type
                 scols[8] = Update age

        Raises SyntaxError if the description, the IP & port, or the
        player count does not match the expected pattern.
        """
        if debug:
            print("pcols = " + str(pcols))
            print("scols = " + str(scols))

        m = self.sdesc.match(pcols[2])
        if m is None:
            raise SyntaxError
        self.description = m.group(1)

        m = self.ipportre.match(scols[1])
        if m is None:
            raise SyntaxError
        self.ip = m.group(1)
        self.port = int(m.group(2))

        self.gametype = scols[3]
        self.mapname = scols[2]
        # Kept as the scraped string; it is only ever printed.
        self.updateage = scols[8]

        m = self.pcountre.match(scols[0])
        if m is None:
            raise SyntaxError
        self.numplayers = int(m.group(1))
        self.maxplayers = int(m.group(2))

        self.players = []

    def __str__(self):
        """Return the one-line summary that the script prints per server."""
        # Each name is preceded by a single space, including the first.
        plist = "".join([" " + str(p) for p in self.players])
        return "%s: %s (%s:%d) | Map: %s | Players: %d/%d : %s (%s old)" % \
            (self.gametype, self.description, self.ip, self.port,
             self.mapname, self.numplayers, self.maxplayers, plist,
             self.updateage)

    def GetTuple(scols):
        """Return the dictionary key used to group rows into servers.

        NOTE(review): the key is built from scols[2] and scols[3], which
        __init__ treats as map name and game type, so two different servers
        running the same map and game type share one entry.  scols[1]
        (IP & port) may have been intended -- confirm before changing.
        """
        return str(scols[2]) + ":" + str(scols[3])
    GetTuple = staticmethod(GetTuple)

    def FixTags(s):
        """Reduce the HTML of one table cell to plain text.

        A game-type cell (one matching `typetag`) is replaced by the type
        taken from its link.  Then <br>/<hr> become newlines, `hwhttags`
        matches are removed, XML entities are converted and any remaining
        tags are stripped.
        """
        # Mangle game type
        t = Server.typetag.match(s)
        if t is not None:
            s = t.group(1)
        s = Server.vwhttags.sub('\n', s)
        s = Server.hwhttags.sub('', s)
        s = str(BeautifulSoup.BeautifulStoneSoup(
            s, convertEntities=BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES))
        s = Server.alltags.sub('', s)
        return s
    FixTags = staticmethod(FixTags)

    def Scrape(handle):
        """Parse a search results page into a dict of Server objects.

        handle -- file-like object (or string) containing the page HTML.

        Returns a dict mapping GetTuple() keys to Server instances, each
        carrying the matched player names from its rows.  Returns None,
        after printing a message, if the results tables cannot be found or
        their row counts differ.  SyntaxError from Server() propagates if a
        row cannot be parsed.
        """
        soup = BeautifulSoup.BeautifulSoup(handle)

        playertbl = soup.find("table", "results")
        if playertbl is None:
            print("Unable to find results")
            return None

        # The server details are in the table following the player table.
        servertbl = playertbl.findNext("table")
        if servertbl is None:
            print("Unable to find results")
            return None

        playerrows = playertbl.findAll("tr")
        serverrows = servertbl.findAll("tr")
        if len(playerrows) != len(serverrows):
            print("Internal error 41223")
            return None

        servers = {}
        # Skip the first row of each table (presumably the header) and walk
        # the remaining rows in lockstep, through to the last one.
        for prow, srow in zip(playerrows[1:], serverrows[1:]):
            pcols = prow.findAll('td')
            scols = srow.findAll('td')
            if len(pcols) != 3:
                if debug:
                    print("pcols has length %d, expected 3" % len(pcols))
                continue

            pcols = [Server.FixTags(str(c)) for c in pcols]
            scols = [Server.FixTags(str(c)) for c in scols]

            stuple = Server.GetTuple(scols)
            if stuple not in servers:
                servers[stuple] = Server(pcols, scols)
            servers[stuple].addplayer(pcols[1])

        return servers
    Scrape = staticmethod(Scrape)

    def addplayer(self, pname):
        """Record pname as a matched player on this server."""
        self.players.append(pname)


def main():
    """Search game-monitor.com for the player named in argv[1] and print
    the servers the matches were found on (at most `maxhits` of them)."""
    maxhits = 10

    if len(sys.argv) < 2:
        print("Bad usage")
        print(sys.argv[0] + " search_string")
        sys.exit(1)

    try:
        # For offline testing a saved page can be used: f = open("gm.html")
        f = urllib.urlopen("http://www.game-monitor.com/search.php?location=AU&search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU")
    except IOError as e:
        print("Unable to fetch page - " + str(e))
        # Exit status 0 on fetch failure is kept as-is; callers may rely
        # on it.
        sys.exit(0)

    try:
        servers = Server.Scrape(f)
    finally:
        f.close()

    if servers is None:
        print("No results available, please check manually")
    elif len(servers) == 0:
        print("No players found")
    else:
        # Server defines no ordering, so sort on the printed line to get a
        # stable, alphabetical listing.
        ordered = sorted(servers.values(), key=str)
        for srv in ordered[:maxhits]:
            print(srv)
        # Only say so when something was actually cut off.
        if len(ordered) > maxhits:
            print("*** Stopping after " + str(maxhits) + " hits")


# This file is a command-line script with no __main__ guard: the search runs
# whenever the module is loaded.
main()