comparison scrape-gm.py @ 15:789cf10ce4c9

Update for new format (for sure)
author darius@Inchoate
date Sun, 14 Dec 2008 18:55:39 +1030
parents 5058c2695109
children eeee17d2072c
comparison
equal deleted inserted replaced
14:eec2fc32ca88 15:789cf10ce4c9
3 ############################################################################ 3 ############################################################################
4 # Screen scraper for game-monitor.com 4 # Screen scraper for game-monitor.com
5 # 5 #
6 # Prints out matched player names aggregated by server 6 # Prints out matched player names aggregated by server
7 # 7 #
8 # $Id: scrape-gm.py,v 1.3 2007/11/18 08:54:07 darius Exp $
9 ############################################################################ 8 ############################################################################
10 # 9 #
11 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. 10 # Copyright (C) 2008 Daniel O'Connor. All rights reserved.
12 # 11 #
13 # Redistribution and use in source and binary forms, with or without 12 # Redistribution and use in source and binary forms, with or without
14 # modification, are permitted provided that the following conditions 13 # modification, are permitted provided that the following conditions
15 # are met: 14 # are met:
16 # 1. Redistributions of source code must retain the above copyright 15 # 1. Redistributions of source code must retain the above copyright
33 # 32 #
34 ############################################################################ 33 ############################################################################
35 34
36 import re, time, datetime, urllib, sys, BeautifulSoup 35 import re, time, datetime, urllib, sys, BeautifulSoup
37 36
37 debug = False
38
38 class Server: 39 class Server:
39 alltags = re.compile('<[^>]*>') 40 alltags = re.compile('<[^>]*>')
40 vwhttags = re.compile('<(br|hr)>') 41 vwhttags = re.compile('<(br|hr)>')
41 hwhttags = re.compile('\&nbsp;') 42 hwhttags = re.compile('\&nbsp;')
42 43 typetag = re.compile('<td><a href="/GameSearch/([^/]+)/.*</td>')
44
43 def __init__(self, description = "", ip = "", port = 0, mapname = "", 45 def __init__(self, description = "", ip = "", port = 0, mapname = "",
44 updateage = 0, numplayers = 0, maxplayers = 0, players = []): 46 updateage = 0, numplayers = 0, maxplayers = 0, players = []):
45 self.description = description 47 self.description = description
46 self.ip = ip 48 self.ip = ip
47 self.port = port 49 self.port = port
50 self.players = [] 52 self.players = []
51 self.numplayers = numplayers 53 self.numplayers = numplayers
52 self.maxplayers = maxplayers 54 self.maxplayers = maxplayers
53 55
54 def __init__(self, pcols, scols): 56 def __init__(self, pcols, scols):
55 # pcols[2] = Player name 57 # pcols[1] = Player name
56 # pcols[3] = Server description 58 # pcols[2] = Server description
57 # scols[0] = Players in server / max players 59 # scols[0] = Players in server / max players
58 # scols[2] = Server IP 60 # scols[1] = Server IP & port
59 # scols[3] = Server port 61 # scols[2] = Map name
60 # scols[4] = Map name 62 # scols[3] = Game type
61 # scols[10] = Update age 63 # scols[8] = Update age
62 self.tuplere = re.compile("\[?([0-9]+)/([0-9]+)\]?") 64 if debug:
63 self.description = pcols[3] 65 print "pcols = " + str(pcols)
64 self.ip = scols[2] 66 print "scols = " + str(scols)
65 self.port = int(scols[3]) 67
66 self.mapname = scols[4] 68 self.pcountre = re.compile("([0-9]+)/([0-9]+)")
67 self.updateage = scols[10] 69 self.ipportre = re.compile("([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)")
68 m = self.tuplere.match(scols[0]) 70 self.sdesc = re.compile(" +[0-9]+\. +(.*)")
71
72 m = self.sdesc.match(pcols[2])
73 if (m == None):
74 raise SyntaxError
75 self.description = m.group(1)
76
77 m = self.ipportre.match(scols[1])
78 if (m == None):
79 raise SyntaxError
80
81 self.ip = m.group(1)
82 self.port = int(m.group(2))
83 self.gametype = scols[3]
84 self.mapname = scols[2]
85 self.updateage = scols[8]
86 m = self.pcountre.match(scols[0])
69 if (m == None): 87 if (m == None):
70 raise SyntaxError 88 raise SyntaxError
71 89
72 self.numplayers = int(m.group(1)) 90 self.numplayers = int(m.group(1))
73 self.maxplayers = int(m.group(2)) 91 self.maxplayers = int(m.group(2))
76 def __str__(self): 94 def __str__(self):
77 plist = "" 95 plist = ""
78 for p in self.players: 96 for p in self.players:
79 plist = plist + " " + str(p) 97 plist = plist + " " + str(p)
80 98
81 return "%s | Map: %s | Players: %d/%d : %s (%s old)" % \ 99 return "%s: %s (%s:%d) | Map: %s | Players: %d/%d : %s (%s old)" % \
82 (self.description, self.mapname, self.numplayers, self.maxplayers, \ 100 (self.gametype, self.description, self.ip, self.port, self.mapname,
83 plist, self.updateage) 101 self.numplayers, self.maxplayers, plist,
102 self.updateage)
84 103
85 def GetTuple(scols): 104 def GetTuple(scols):
86 return str(scols[2]) + ":" + str(scols[3]) 105 return str(scols[2]) + ":" + str(scols[3])
87 GetTuple = staticmethod(GetTuple) 106 GetTuple = staticmethod(GetTuple)
88 107
89 def FixTags(s): 108 def FixTags(s):
109 # Mangle game type
110 t = Server.typetag.match(s)
111 if t != None:
112 s = t.group(1)
90 s = re.sub(Server.vwhttags, '\n', s) 113 s = re.sub(Server.vwhttags, '\n', s)
91 s = re.sub(Server.hwhttags, '', s) 114 s = re.sub(Server.hwhttags, '', s)
92 s = str(BeautifulSoup.BeautifulStoneSoup( \ 115 s = str(BeautifulSoup.BeautifulStoneSoup( \
93 s, convertEntities = BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES)) 116 s, convertEntities = BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES))
94 s = re.sub(Server.alltags, '', s) 117 s = re.sub(Server.alltags, '', s)
96 FixTags = staticmethod(FixTags) 119 FixTags = staticmethod(FixTags)
97 120
98 def Scrape(handle): 121 def Scrape(handle):
99 s = BeautifulSoup.BeautifulSoup(handle) 122 s = BeautifulSoup.BeautifulSoup(handle)
100 123
101 playertbl = s.find("table", "search_table") 124 playertbl = s.find("table", "results")
102 if (playertbl == None): 125 if (playertbl == None):
103 #print "Unable to find results" 126 if True:
127 print "Unable to find results"
104 return None 128 return None
105 129
106 servertbl = playertbl.findNext("table", "search_table") 130 servertbl = playertbl.findNext("table")
107 131
108 playerrows = playertbl.findAll("tr") 132 playerrows = playertbl.findAll("tr")
109 serverrows = servertbl.findAll("tr") 133 serverrows = servertbl.findAll("tr")
110 134
111 if (len(playerrows) != len(serverrows)): 135 if (len(playerrows) != len(serverrows)):
114 138
115 servers = {} 139 servers = {}
116 for i in range(len(playerrows[1:])): 140 for i in range(len(playerrows[1:])):
117 pcols = playerrows[i].findAll('td') 141 pcols = playerrows[i].findAll('td')
118 scols = serverrows[i].findAll('td') 142 scols = serverrows[i].findAll('td')
119 if (len(pcols) != 4): 143 if (len(pcols) != 3):
144 if debug:
145 print "pcols has length %d, expected 3" % len(pcols)
120 continue 146 continue
121 147
122 pcols = map(lambda c : Server.FixTags(str(c)), pcols) 148 pcols = map(lambda c : Server.FixTags(str(c)), pcols)
123 scols = map(lambda c : Server.FixTags(str(c)), scols) 149 scols = map(lambda c : Server.FixTags(str(c)), scols)
124 150
126 152
127 if (stuple not in servers): 153 if (stuple not in servers):
128 s = Server(pcols, scols) 154 s = Server(pcols, scols)
129 servers[stuple] = s 155 servers[stuple] = s
130 156
131 servers[stuple].addplayer(pcols[2]) 157 servers[stuple].addplayer(pcols[1])
132 158
133 return servers 159 return servers
134 Scrape = staticmethod(Scrape) 160 Scrape = staticmethod(Scrape)
135 161
136 def addplayer(self, pname): 162 def addplayer(self, pname):
137 self.players.append(pname) 163 self.players.append(pname)
138 164
139 165
140 if (1): 166 if True:
141 maxhits = 10 167 maxhits = 10
142 if (len(sys.argv) < 2): 168 if (len(sys.argv) < 2):
143 print "Bad usage" 169 print "Bad usage"
144 print sys.argv[0] + "search_string" 170 print sys.argv[0] + "search_string"
145 sys.exit(1) 171 sys.exit(1)
146 172
147 try: 173 try:
148 #f = open("gm.html") 174 #f = open("gm.html")
149 f = urllib.urlopen("http://www.game-monitor.com/search.php?search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU") 175 f = urllib.urlopen("http://www.game-monitor.com/search.php?location=AU&search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU")
150 except IOError, e: 176 except IOError, e:
151 print "Unable to fetch page - " + str(e) 177 print "Unable to fetch page - " + str(e)
152 sys.exit(0) 178 sys.exit(0)
153 179
154 servers = Server.Scrape(f) 180 servers = Server.Scrape(f)
156 if (servers == None): 182 if (servers == None):
157 print "No results available, please check manually" 183 print "No results available, please check manually"
158 elif (len(servers) == 0): 184 elif (len(servers) == 0):
159 print "No players found" 185 print "No players found"
160 else: 186 else:
187 tmp = []
188 for i in servers:
189 tmp.append(servers[i])
190 tmp.sort()
161 i = 0 191 i = 0
162 for s in servers: 192 for s in tmp:
163 i = i + 1 193 i = i + 1
164 print servers[s] 194 print s
165 if (i >= maxhits): 195 if (i >= maxhits):
166 print "*** Stopping after " + str(maxhits) + " hits" 196 print "*** Stopping after " + str(maxhits) + " hits"
167 break 197 break