annotate scrape-gm.py @ 13:e8550290e512

Update parser for new format.
author darius@Inchoate
date Sun, 14 Dec 2008 18:48:15 +1030
parents ae9e833e4447
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
1 #!/usr/bin/env python
ac32969d1bec Initial revision
darius
parents:
diff changeset
2
ac32969d1bec Initial revision
darius
parents:
diff changeset
3 ############################################################################
ac32969d1bec Initial revision
darius
parents:
diff changeset
4 # Screen scraper for game-monitor.com
ac32969d1bec Initial revision
darius
parents:
diff changeset
5 #
ac32969d1bec Initial revision
darius
parents:
diff changeset
6 # Prints out matched player names aggregated by server
ac32969d1bec Initial revision
darius
parents:
diff changeset
7 #
ac32969d1bec Initial revision
darius
parents:
diff changeset
8 ############################################################################
ac32969d1bec Initial revision
darius
parents:
diff changeset
9 #
11
22a51e8c0a69 Update copyright date. Remove CVS ID.
darius@inchoate.localdomain
parents: 10
diff changeset
10 # Copyright (C) 2008 Daniel O'Connor. All rights reserved.
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
11 #
ac32969d1bec Initial revision
darius
parents:
diff changeset
12 # Redistribution and use in source and binary forms, with or without
ac32969d1bec Initial revision
darius
parents:
diff changeset
13 # modification, are permitted provided that the following conditions
ac32969d1bec Initial revision
darius
parents:
diff changeset
14 # are met:
ac32969d1bec Initial revision
darius
parents:
diff changeset
15 # 1. Redistributions of source code must retain the above copyright
ac32969d1bec Initial revision
darius
parents:
diff changeset
16 # notice, this list of conditions and the following disclaimer.
ac32969d1bec Initial revision
darius
parents:
diff changeset
17 # 2. Redistributions in binary form must reproduce the above copyright
ac32969d1bec Initial revision
darius
parents:
diff changeset
18 # notice, this list of conditions and the following disclaimer in the
ac32969d1bec Initial revision
darius
parents:
diff changeset
19 # documentation and/or other materials provided with the distribution.
ac32969d1bec Initial revision
darius
parents:
diff changeset
20 #
ac32969d1bec Initial revision
darius
parents:
diff changeset
21 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
ac32969d1bec Initial revision
darius
parents:
diff changeset
22 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
ac32969d1bec Initial revision
darius
parents:
diff changeset
23 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ac32969d1bec Initial revision
darius
parents:
diff changeset
24 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
ac32969d1bec Initial revision
darius
parents:
diff changeset
25 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
ac32969d1bec Initial revision
darius
parents:
diff changeset
26 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
ac32969d1bec Initial revision
darius
parents:
diff changeset
27 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
ac32969d1bec Initial revision
darius
parents:
diff changeset
28 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
ac32969d1bec Initial revision
darius
parents:
diff changeset
29 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
ac32969d1bec Initial revision
darius
parents:
diff changeset
30 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
ac32969d1bec Initial revision
darius
parents:
diff changeset
31 # SUCH DAMAGE.
ac32969d1bec Initial revision
darius
parents:
diff changeset
32 #
ac32969d1bec Initial revision
darius
parents:
diff changeset
33 ############################################################################
ac32969d1bec Initial revision
darius
parents:
diff changeset
34
ac32969d1bec Initial revision
darius
parents:
diff changeset
35 import re, time, datetime, urllib, sys, BeautifulSoup
ac32969d1bec Initial revision
darius
parents:
diff changeset
36
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
37 debug = False
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
38
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
39 class Server:
ac32969d1bec Initial revision
darius
parents:
diff changeset
40 alltags = re.compile('<[^>]*>')
ac32969d1bec Initial revision
darius
parents:
diff changeset
41 vwhttags = re.compile('<(br|hr)>')
ac32969d1bec Initial revision
darius
parents:
diff changeset
42 hwhttags = re.compile('\&nbsp;')
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
43 typetag = re.compile('<td><a href="/GameSearch/([^/]+)/.*</td>')
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
44
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
45 def __init__(self, description = "", ip = "", port = 0, mapname = "",
ac32969d1bec Initial revision
darius
parents:
diff changeset
46 updateage = 0, numplayers = 0, maxplayers = 0, players = []):
ac32969d1bec Initial revision
darius
parents:
diff changeset
47 self.description = description
ac32969d1bec Initial revision
darius
parents:
diff changeset
48 self.ip = ip
ac32969d1bec Initial revision
darius
parents:
diff changeset
49 self.port = port
ac32969d1bec Initial revision
darius
parents:
diff changeset
50 self.mapname = mapname
ac32969d1bec Initial revision
darius
parents:
diff changeset
51 self.updateage = int(updateage)
ac32969d1bec Initial revision
darius
parents:
diff changeset
52 self.players = []
ac32969d1bec Initial revision
darius
parents:
diff changeset
53 self.numplayers = numplayers
ac32969d1bec Initial revision
darius
parents:
diff changeset
54 self.maxplayers = maxplayers
ac32969d1bec Initial revision
darius
parents:
diff changeset
55
ac32969d1bec Initial revision
darius
parents:
diff changeset
56 def __init__(self, pcols, scols):
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
57 # pcols[1] = Player name
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
58 # pcols[2] = Server description
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
59 # scols[0] = Players in server / max players
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
60 # scols[1] = Server IP & port
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
61 # scols[2] = Map name
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
62 # scols[3] = Game type
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
63 # scols[8] = Update age
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
64 if debug:
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
65 print "pcols = " + str(pcols)
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
66 print "scols = " + str(scols)
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
67
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
68 self.pcountre = re.compile("([0-9]+)/([0-9]+)")
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
69 self.ipportre = re.compile("([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)")
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
70 self.sdesc = re.compile(" +[0-9]+\. +(.*)")
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
71
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
72 m = self.sdesc.match(pcols[2])
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
73 if (m == None):
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
74 raise SyntaxError
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
75 self.description = m.group(1)
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
76
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
77 m = self.ipportre.match(scols[1])
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
78 if (m == None):
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
79 raise SyntaxError
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
80
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
81 self.ip = m.group(1)
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
82 self.port = int(m.group(2))
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
83 self.gametype = scols[3]
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
84 self.mapname = scols[2]
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
85 self.updateage = scols[8]
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
86 m = self.pcountre.match(scols[0])
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
87 if (m == None):
ac32969d1bec Initial revision
darius
parents:
diff changeset
88 raise SyntaxError
ac32969d1bec Initial revision
darius
parents:
diff changeset
89
ac32969d1bec Initial revision
darius
parents:
diff changeset
90 self.numplayers = int(m.group(1))
ac32969d1bec Initial revision
darius
parents:
diff changeset
91 self.maxplayers = int(m.group(2))
ac32969d1bec Initial revision
darius
parents:
diff changeset
92 self.players = []
ac32969d1bec Initial revision
darius
parents:
diff changeset
93
ac32969d1bec Initial revision
darius
parents:
diff changeset
94 def __str__(self):
ac32969d1bec Initial revision
darius
parents:
diff changeset
95 plist = ""
ac32969d1bec Initial revision
darius
parents:
diff changeset
96 for p in self.players:
ac32969d1bec Initial revision
darius
parents:
diff changeset
97 plist = plist + " " + str(p)
ac32969d1bec Initial revision
darius
parents:
diff changeset
98
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
99 return "%s: %s (%s:%d) | Map: %s | Players: %d/%d : %s (%s old)" % \
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
100 (self.gametype, self.description, self.ip, self.port, self.mapname,
7
825302e32c35 Print out the server IP & port tuple as well as the name.
darius@inchoate.localdomain
parents: 6
diff changeset
101 self.numplayers, self.maxplayers, plist,
825302e32c35 Print out the server IP & port tuple as well as the name.
darius@inchoate.localdomain
parents: 6
diff changeset
102 self.updateage)
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
103
ac32969d1bec Initial revision
darius
parents:
diff changeset
104 def GetTuple(scols):
ac32969d1bec Initial revision
darius
parents:
diff changeset
105 return str(scols[2]) + ":" + str(scols[3])
ac32969d1bec Initial revision
darius
parents:
diff changeset
106 GetTuple = staticmethod(GetTuple)
ac32969d1bec Initial revision
darius
parents:
diff changeset
107
ac32969d1bec Initial revision
darius
parents:
diff changeset
108 def FixTags(s):
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
109 # Mangle game type
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
110 t = Server.typetag.match(s)
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
111 if t != None:
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
112 s = t.group(1)
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
113 s = re.sub(Server.vwhttags, '\n', s)
ac32969d1bec Initial revision
darius
parents:
diff changeset
114 s = re.sub(Server.hwhttags, '', s)
ac32969d1bec Initial revision
darius
parents:
diff changeset
115 s = str(BeautifulSoup.BeautifulStoneSoup( \
ac32969d1bec Initial revision
darius
parents:
diff changeset
116 s, convertEntities = BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES))
ac32969d1bec Initial revision
darius
parents:
diff changeset
117 s = re.sub(Server.alltags, '', s)
ac32969d1bec Initial revision
darius
parents:
diff changeset
118 return(s)
ac32969d1bec Initial revision
darius
parents:
diff changeset
119 FixTags = staticmethod(FixTags)
ac32969d1bec Initial revision
darius
parents:
diff changeset
120
ac32969d1bec Initial revision
darius
parents:
diff changeset
121 def Scrape(handle):
ac32969d1bec Initial revision
darius
parents:
diff changeset
122 s = BeautifulSoup.BeautifulSoup(handle)
ac32969d1bec Initial revision
darius
parents:
diff changeset
123
10
0e18c714b69d Update for new page layout.
darius@inchoate.localdomain
parents: 8
diff changeset
124 playertbl = s.find("table", "results")
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
125 if (playertbl == None):
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
126 if True:
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
127 print "Unable to find results"
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
128 return None
ac32969d1bec Initial revision
darius
parents:
diff changeset
129
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
130 servertbl = playertbl.findNext("table")
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
131
ac32969d1bec Initial revision
darius
parents:
diff changeset
132 playerrows = playertbl.findAll("tr")
ac32969d1bec Initial revision
darius
parents:
diff changeset
133 serverrows = servertbl.findAll("tr")
ac32969d1bec Initial revision
darius
parents:
diff changeset
134
ac32969d1bec Initial revision
darius
parents:
diff changeset
135 if (len(playerrows) != len(serverrows)):
ac32969d1bec Initial revision
darius
parents:
diff changeset
136 print "Internal error 41223"
ac32969d1bec Initial revision
darius
parents:
diff changeset
137 return
ac32969d1bec Initial revision
darius
parents:
diff changeset
138
ac32969d1bec Initial revision
darius
parents:
diff changeset
139 servers = {}
ac32969d1bec Initial revision
darius
parents:
diff changeset
140 for i in range(len(playerrows[1:])):
ac32969d1bec Initial revision
darius
parents:
diff changeset
141 pcols = playerrows[i].findAll('td')
ac32969d1bec Initial revision
darius
parents:
diff changeset
142 scols = serverrows[i].findAll('td')
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
143 if (len(pcols) != 3):
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
144 if debug:
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
145 print "pcols has length %d, expected 3" % len(pcols)
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
146 continue
ac32969d1bec Initial revision
darius
parents:
diff changeset
147
ac32969d1bec Initial revision
darius
parents:
diff changeset
148 pcols = map(lambda c : Server.FixTags(str(c)), pcols)
ac32969d1bec Initial revision
darius
parents:
diff changeset
149 scols = map(lambda c : Server.FixTags(str(c)), scols)
ac32969d1bec Initial revision
darius
parents:
diff changeset
150
ac32969d1bec Initial revision
darius
parents:
diff changeset
151 stuple = Server.GetTuple(scols)
ac32969d1bec Initial revision
darius
parents:
diff changeset
152
ac32969d1bec Initial revision
darius
parents:
diff changeset
153 if (stuple not in servers):
ac32969d1bec Initial revision
darius
parents:
diff changeset
154 s = Server(pcols, scols)
ac32969d1bec Initial revision
darius
parents:
diff changeset
155 servers[stuple] = s
ac32969d1bec Initial revision
darius
parents:
diff changeset
156
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
157 servers[stuple].addplayer(pcols[1])
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
158
ac32969d1bec Initial revision
darius
parents:
diff changeset
159 return servers
ac32969d1bec Initial revision
darius
parents:
diff changeset
160 Scrape = staticmethod(Scrape)
ac32969d1bec Initial revision
darius
parents:
diff changeset
161
ac32969d1bec Initial revision
darius
parents:
diff changeset
162 def addplayer(self, pname):
ac32969d1bec Initial revision
darius
parents:
diff changeset
163 self.players.append(pname)
ac32969d1bec Initial revision
darius
parents:
diff changeset
164
ac32969d1bec Initial revision
darius
parents:
diff changeset
165
13
e8550290e512 Update parser for new format.
darius@Inchoate
parents: 12
diff changeset
166 if True:
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
167 maxhits = 10
ac32969d1bec Initial revision
darius
parents:
diff changeset
168 if (len(sys.argv) < 2):
ac32969d1bec Initial revision
darius
parents:
diff changeset
169 print "Bad usage"
ac32969d1bec Initial revision
darius
parents:
diff changeset
170 print sys.argv[0] + "search_string"
ac32969d1bec Initial revision
darius
parents:
diff changeset
171 sys.exit(1)
ac32969d1bec Initial revision
darius
parents:
diff changeset
172
ac32969d1bec Initial revision
darius
parents:
diff changeset
173 try:
ac32969d1bec Initial revision
darius
parents:
diff changeset
174 #f = open("gm.html")
8
fbaf34d7bab7 Only list Australian servers.
darius@inchoate.localdomain
parents: 7
diff changeset
175 f = urllib.urlopen("http://www.game-monitor.com/search.php?location=AU&search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU")
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
176 except IOError, e:
ac32969d1bec Initial revision
darius
parents:
diff changeset
177 print "Unable to fetch page - " + str(e)
ac32969d1bec Initial revision
darius
parents:
diff changeset
178 sys.exit(0)
ac32969d1bec Initial revision
darius
parents:
diff changeset
179
ac32969d1bec Initial revision
darius
parents:
diff changeset
180 servers = Server.Scrape(f)
ac32969d1bec Initial revision
darius
parents:
diff changeset
181 del f
6
5058c2695109 Properly report if we can't parse the results.
darius
parents: 5
diff changeset
182 if (servers == None):
5058c2695109 Properly report if we can't parse the results.
darius
parents: 5
diff changeset
183 print "No results available, please check manually"
5058c2695109 Properly report if we can't parse the results.
darius
parents: 5
diff changeset
184 elif (len(servers) == 0):
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
185 print "No players found"
ac32969d1bec Initial revision
darius
parents:
diff changeset
186 else:
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
187 tmp = []
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
188 for i in servers:
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
189 tmp.append(servers[i])
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
190 tmp.sort()
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
191 i = 0
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
192 for s in tmp:
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
193 i = i + 1
12
ae9e833e4447 Get game type.
darius@inchoate.localdomain
parents: 11
diff changeset
194 print s
1
ac32969d1bec Initial revision
darius
parents:
diff changeset
195 if (i >= maxhits):
ac32969d1bec Initial revision
darius
parents:
diff changeset
196 print "*** Stopping after " + str(maxhits) + " hits"
ac32969d1bec Initial revision
darius
parents:
diff changeset
197 break