Mercurial > ~darius > hgwebdir.cgi > scrape-gm
comparison scrape-gm.py @ 15:789cf10ce4c9
Update for new format (for sure)
author | darius@Inchoate |
---|---|
date | Sun, 14 Dec 2008 18:55:39 +1030 |
parents | 5058c2695109 |
children | eeee17d2072c |
comparison
equal
deleted
inserted
replaced
14:eec2fc32ca88 | 15:789cf10ce4c9 |
---|---|
3 ############################################################################ | 3 ############################################################################ |
4 # Screen scraper for game-monitor.com | 4 # Screen scraper for game-monitor.com |
5 # | 5 # |
6 # Prints out matched player names aggregated by server | 6 # Prints out matched player names aggregated by server |
7 # | 7 # |
8 # $Id: scrape-gm.py,v 1.3 2007/11/18 08:54:07 darius Exp $ | |
9 ############################################################################ | 8 ############################################################################ |
10 # | 9 # |
11 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | 10 # Copyright (C) 2008 Daniel O'Connor. All rights reserved. |
12 # | 11 # |
13 # Redistribution and use in source and binary forms, with or without | 12 # Redistribution and use in source and binary forms, with or without |
14 # modification, are permitted provided that the following conditions | 13 # modification, are permitted provided that the following conditions |
15 # are met: | 14 # are met: |
16 # 1. Redistributions of source code must retain the above copyright | 15 # 1. Redistributions of source code must retain the above copyright |
33 # | 32 # |
34 ############################################################################ | 33 ############################################################################ |
35 | 34 |
36 import re, time, datetime, urllib, sys, BeautifulSoup | 35 import re, time, datetime, urllib, sys, BeautifulSoup |
37 | 36 |
37 debug = False | |
38 | |
38 class Server: | 39 class Server: |
39 alltags = re.compile('<[^>]*>') | 40 alltags = re.compile('<[^>]*>') |
40 vwhttags = re.compile('<(br|hr)>') | 41 vwhttags = re.compile('<(br|hr)>') |
41 hwhttags = re.compile('\ ') | 42 hwhttags = re.compile('\ ') |
42 | 43 typetag = re.compile('<td><a href="/GameSearch/([^/]+)/.*</td>') |
44 | |
43 def __init__(self, description = "", ip = "", port = 0, mapname = "", | 45 def __init__(self, description = "", ip = "", port = 0, mapname = "", |
44 updateage = 0, numplayers = 0, maxplayers = 0, players = []): | 46 updateage = 0, numplayers = 0, maxplayers = 0, players = []): |
45 self.description = description | 47 self.description = description |
46 self.ip = ip | 48 self.ip = ip |
47 self.port = port | 49 self.port = port |
50 self.players = [] | 52 self.players = [] |
51 self.numplayers = numplayers | 53 self.numplayers = numplayers |
52 self.maxplayers = maxplayers | 54 self.maxplayers = maxplayers |
53 | 55 |
54 def __init__(self, pcols, scols): | 56 def __init__(self, pcols, scols): |
55 # pcols[2] = Player name | 57 # pcols[1] = Player name |
56 # pcols[3] = Server description | 58 # pcols[2] = Server description |
57 # scols[0] = Players in server / max players | 59 # scols[0] = Players in server / max players |
58 # scols[2] = Server IP | 60 # scols[1] = Server IP & port |
59 # scols[3] = Server port | 61 # scols[2] = Map name |
60 # scols[4] = Map name | 62 # scols[3] = Game type |
61 # scols[10] = Update age | 63 # scols[8] = Update age |
62 self.tuplere = re.compile("\[?([0-9]+)/([0-9]+)\]?") | 64 if debug: |
63 self.description = pcols[3] | 65 print "pcols = " + str(pcols) |
64 self.ip = scols[2] | 66 print "scols = " + str(scols) |
65 self.port = int(scols[3]) | 67 |
66 self.mapname = scols[4] | 68 self.pcountre = re.compile("([0-9]+)/([0-9]+)") |
67 self.updateage = scols[10] | 69 self.ipportre = re.compile("([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)") |
68 m = self.tuplere.match(scols[0]) | 70 self.sdesc = re.compile(" +[0-9]+\. +(.*)") |
71 | |
72 m = self.sdesc.match(pcols[2]) | |
73 if (m == None): | |
74 raise SyntaxError | |
75 self.description = m.group(1) | |
76 | |
77 m = self.ipportre.match(scols[1]) | |
78 if (m == None): | |
79 raise SyntaxError | |
80 | |
81 self.ip = m.group(1) | |
82 self.port = int(m.group(2)) | |
83 self.gametype = scols[3] | |
84 self.mapname = scols[2] | |
85 self.updateage = scols[8] | |
86 m = self.pcountre.match(scols[0]) | |
69 if (m == None): | 87 if (m == None): |
70 raise SyntaxError | 88 raise SyntaxError |
71 | 89 |
72 self.numplayers = int(m.group(1)) | 90 self.numplayers = int(m.group(1)) |
73 self.maxplayers = int(m.group(2)) | 91 self.maxplayers = int(m.group(2)) |
76 def __str__(self): | 94 def __str__(self): |
77 plist = "" | 95 plist = "" |
78 for p in self.players: | 96 for p in self.players: |
79 plist = plist + " " + str(p) | 97 plist = plist + " " + str(p) |
80 | 98 |
81 return "%s | Map: %s | Players: %d/%d : %s (%s old)" % \ | 99 return "%s: %s (%s:%d) | Map: %s | Players: %d/%d : %s (%s old)" % \ |
82 (self.description, self.mapname, self.numplayers, self.maxplayers, \ | 100 (self.gametype, self.description, self.ip, self.port, self.mapname, |
83 plist, self.updateage) | 101 self.numplayers, self.maxplayers, plist, |
102 self.updateage) | |
84 | 103 |
85 def GetTuple(scols): | 104 def GetTuple(scols): |
86 return str(scols[2]) + ":" + str(scols[3]) | 105 return str(scols[2]) + ":" + str(scols[3]) |
87 GetTuple = staticmethod(GetTuple) | 106 GetTuple = staticmethod(GetTuple) |
88 | 107 |
89 def FixTags(s): | 108 def FixTags(s): |
109 # Mangle game type | |
110 t = Server.typetag.match(s) | |
111 if t != None: | |
112 s = t.group(1) | |
90 s = re.sub(Server.vwhttags, '\n', s) | 113 s = re.sub(Server.vwhttags, '\n', s) |
91 s = re.sub(Server.hwhttags, '', s) | 114 s = re.sub(Server.hwhttags, '', s) |
92 s = str(BeautifulSoup.BeautifulStoneSoup( \ | 115 s = str(BeautifulSoup.BeautifulStoneSoup( \ |
93 s, convertEntities = BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES)) | 116 s, convertEntities = BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES)) |
94 s = re.sub(Server.alltags, '', s) | 117 s = re.sub(Server.alltags, '', s) |
96 FixTags = staticmethod(FixTags) | 119 FixTags = staticmethod(FixTags) |
97 | 120 |
98 def Scrape(handle): | 121 def Scrape(handle): |
99 s = BeautifulSoup.BeautifulSoup(handle) | 122 s = BeautifulSoup.BeautifulSoup(handle) |
100 | 123 |
101 playertbl = s.find("table", "search_table") | 124 playertbl = s.find("table", "results") |
102 if (playertbl == None): | 125 if (playertbl == None): |
103 #print "Unable to find results" | 126 if True: |
127 print "Unable to find results" | |
104 return None | 128 return None |
105 | 129 |
106 servertbl = playertbl.findNext("table", "search_table") | 130 servertbl = playertbl.findNext("table") |
107 | 131 |
108 playerrows = playertbl.findAll("tr") | 132 playerrows = playertbl.findAll("tr") |
109 serverrows = servertbl.findAll("tr") | 133 serverrows = servertbl.findAll("tr") |
110 | 134 |
111 if (len(playerrows) != len(serverrows)): | 135 if (len(playerrows) != len(serverrows)): |
114 | 138 |
115 servers = {} | 139 servers = {} |
116 for i in range(len(playerrows[1:])): | 140 for i in range(len(playerrows[1:])): |
117 pcols = playerrows[i].findAll('td') | 141 pcols = playerrows[i].findAll('td') |
118 scols = serverrows[i].findAll('td') | 142 scols = serverrows[i].findAll('td') |
119 if (len(pcols) != 4): | 143 if (len(pcols) != 3): |
144 if debug: | |
145 print "pcols has length %d, expected 3" % len(pcols) | |
120 continue | 146 continue |
121 | 147 |
122 pcols = map(lambda c : Server.FixTags(str(c)), pcols) | 148 pcols = map(lambda c : Server.FixTags(str(c)), pcols) |
123 scols = map(lambda c : Server.FixTags(str(c)), scols) | 149 scols = map(lambda c : Server.FixTags(str(c)), scols) |
124 | 150 |
126 | 152 |
127 if (stuple not in servers): | 153 if (stuple not in servers): |
128 s = Server(pcols, scols) | 154 s = Server(pcols, scols) |
129 servers[stuple] = s | 155 servers[stuple] = s |
130 | 156 |
131 servers[stuple].addplayer(pcols[2]) | 157 servers[stuple].addplayer(pcols[1]) |
132 | 158 |
133 return servers | 159 return servers |
134 Scrape = staticmethod(Scrape) | 160 Scrape = staticmethod(Scrape) |
135 | 161 |
136 def addplayer(self, pname): | 162 def addplayer(self, pname): |
137 self.players.append(pname) | 163 self.players.append(pname) |
138 | 164 |
139 | 165 |
140 if (1): | 166 if True: |
141 maxhits = 10 | 167 maxhits = 10 |
142 if (len(sys.argv) < 2): | 168 if (len(sys.argv) < 2): |
143 print "Bad usage" | 169 print "Bad usage" |
144 print sys.argv[0] + "search_string" | 170 print sys.argv[0] + "search_string" |
145 sys.exit(1) | 171 sys.exit(1) |
146 | 172 |
147 try: | 173 try: |
148 #f = open("gm.html") | 174 #f = open("gm.html") |
149 f = urllib.urlopen("http://www.game-monitor.com/search.php?search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU") | 175 f = urllib.urlopen("http://www.game-monitor.com/search.php?location=AU&search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU") |
150 except IOError, e: | 176 except IOError, e: |
151 print "Unable to fetch page - " + str(e) | 177 print "Unable to fetch page - " + str(e) |
152 sys.exit(0) | 178 sys.exit(0) |
153 | 179 |
154 servers = Server.Scrape(f) | 180 servers = Server.Scrape(f) |
156 if (servers == None): | 182 if (servers == None): |
157 print "No results available, please check manually" | 183 print "No results available, please check manually" |
158 elif (len(servers) == 0): | 184 elif (len(servers) == 0): |
159 print "No players found" | 185 print "No players found" |
160 else: | 186 else: |
187 tmp = [] | |
188 for i in servers: | |
189 tmp.append(servers[i]) | |
190 tmp.sort() | |
161 i = 0 | 191 i = 0 |
162 for s in servers: | 192 for s in tmp: |
163 i = i + 1 | 193 i = i + 1 |
164 print servers[s] | 194 print s |
165 if (i >= maxhits): | 195 if (i >= maxhits): |
166 print "*** Stopping after " + str(maxhits) + " hits" | 196 print "*** Stopping after " + str(maxhits) + " hits" |
167 break | 197 break |