Mercurial > ~darius > hgwebdir.cgi > scrape-gm
annotate scrape-gm.py @ 10:0e18c714b69d
Update for new page layout.
Luckily no substantive changes! :)
author | darius@inchoate.localdomain |
---|---|
date | Fri, 29 Feb 2008 19:19:14 +1030 |
parents | fbaf34d7bab7 |
children | 22a51e8c0a69 |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 | |
3 ############################################################################ | |
4 # Screen scraper for game-monitor.com | |
5 # | |
6 # Prints out matched player names aggregated by server | |
7 # | |
6 | 8 # $Id: scrape-gm.py,v 1.3 2007/11/18 08:54:07 darius Exp $ |
1 | 9 ############################################################################ |
10 # | |
11 # Copyright (C) 2007 Daniel O'Connor. All rights reserved. | |
12 # | |
13 # Redistribution and use in source and binary forms, with or without | |
14 # modification, are permitted provided that the following conditions | |
15 # are met: | |
16 # 1. Redistributions of source code must retain the above copyright | |
17 # notice, this list of conditions and the following disclaimer. | |
18 # 2. Redistributions in binary form must reproduce the above copyright | |
19 # notice, this list of conditions and the following disclaimer in the | |
20 # documentation and/or other materials provided with the distribution. | |
21 # | |
22 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
23 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
24 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
25 # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE | |
26 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
27 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
28 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
29 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
30 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
31 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 # SUCH DAMAGE. | |
33 # | |
34 ############################################################################ | |
35 | |
36 import re, time, datetime, urllib, sys, BeautifulSoup | |
37 | |
class Server:
    """A game server from a game-monitor.com player search result.

    Holds the server's description, address, map, player counts and update
    age as scraped from the results page, plus the list of matched player
    names that were found on that server.
    """

    # Matches any HTML/XML tag
    alltags = re.compile('<[^>]*>')
    # Tags that act as vertical whitespace (turned into newlines)
    vwhttags = re.compile('<(br|hr)>')
    # Horizontal whitespace that is removed outright.
    # NOTE(review): as written this matches a literal space; the pattern may
    # have been an HTML entity before the page was rendered - confirm
    # against the real file.
    hwhttags = re.compile(r'\ ')
    # "numplayers/maxplayers", optionally wrapped in square brackets
    tuplere = re.compile(r"\[?([0-9]+)/([0-9]+)\]?")

    def __init__(self, pcols, scols):
        """Build a server from one row of cleaned column text.

        pcols -- columns of the player table row:
                 pcols[2] = Player name
                 pcols[3] = Server description
        scols -- columns of the matching server table row:
                 scols[0]  = Players in server / max players
                 scols[2]  = Server IP
                 scols[3]  = Server port
                 scols[4]  = Map name
                 scols[10] = Update age (stored as the column text)

        Raises SyntaxError if scols[0] does not look like "N/M".
        The player list starts empty; use addplayer() to fill it.
        """
        counts = self.tuplere.match(scols[0])
        if counts is None:
            raise SyntaxError

        self.description = pcols[3]
        self.ip = scols[2]
        self.port = int(scols[3])
        self.mapname = scols[4]
        self.updateage = scols[10]
        self.numplayers = int(counts.group(1))
        self.maxplayers = int(counts.group(2))
        self.players = []

    def __str__(self):
        """Return a one line summary of the server and its matched players."""
        # Every player name is preceded by a single space
        plist = "".join(" " + str(p) for p in self.players)

        return "%s (%s:%d) | Map: %s | Players: %d/%d : %s (%s old)" % \
            (self.description, self.ip, self.port, self.mapname,
             self.numplayers, self.maxplayers, plist,
             self.updateage)

    @staticmethod
    def GetTuple(scols):
        """Return the "ip:port" key for a row of server columns."""
        return str(scols[2]) + ":" + str(scols[3])

    @staticmethod
    def FixTags(s):
        """Return s as plain text.

        br/hr tags become newlines, hwhttags matches are dropped, XML
        entities are converted by BeautifulStoneSoup and any remaining
        tags are stripped.
        """
        s = re.sub(Server.vwhttags, '\n', s)
        s = re.sub(Server.hwhttags, '', s)
        s = str(BeautifulSoup.BeautifulStoneSoup(
            s, convertEntities=BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES))
        s = re.sub(Server.alltags, '', s)
        return s

    @staticmethod
    def Scrape(handle):
        """Parse a search results page into a dictionary of servers.

        handle -- the page, passed straight to BeautifulSoup.BeautifulSoup.

        Returns a dict mapping "ip:port" to a Server whose players list
        holds every matched player on that server.  Returns None if the
        two "results" tables cannot be found, or (after printing an
        internal error) if they do not have the same number of rows.
        """
        soup = BeautifulSoup.BeautifulSoup(handle)

        playertbl = soup.find("table", "results")
        if playertbl is None:
            return None

        servertbl = playertbl.findNext("table", "results")
        if servertbl is None:
            # Players table without its companion servers table
            return None

        playerrows = playertbl.findAll("tr")
        serverrows = servertbl.findAll("tr")

        if len(playerrows) != len(serverrows):
            print("Internal error 41223")
            return None

        servers = {}
        # The first row of each table is skipped (presumably a header row).
        # Slicing both lists keeps the last result row, which an index loop
        # bounded by len(playerrows[1:]) would never reach.
        for prow, srow in zip(playerrows[1:], serverrows[1:]):
            pcols = prow.findAll('td')
            scols = srow.findAll('td')
            if len(pcols) != 4:
                continue

            pcols = [Server.FixTags(str(c)) for c in pcols]
            scols = [Server.FixTags(str(c)) for c in scols]

            stuple = Server.GetTuple(scols)
            if stuple not in servers:
                servers[stuple] = Server(pcols, scols)

            servers[stuple].addplayer(pcols[2])

        return servers

    def addplayer(self, pname):
        """Record pname as a matched player on this server."""
        self.players.append(pname)
139 | |
140 | |
if __name__ == "__main__":
    # Command line entry point: search game-monitor.com for the player name
    # given as the first argument and print each server it was found on.
    maxhits = 10    # Stop printing after this many servers
    if len(sys.argv) < 2:
        print("Bad usage")
        print(sys.argv[0] + " search_string")
        sys.exit(1)

    url = ("http://www.game-monitor.com/search.php?location=AU&search="
           + urllib.quote(sys.argv[1]) + "&type=player&location=AU")
    try:
        f = urllib.urlopen(url)
    except IOError as e:
        print("Unable to fetch page - " + str(e))
        # NOTE(review): exit status is 0 even though the fetch failed;
        # left as-is in case callers rely on it - confirm.
        sys.exit(0)

    # Close the connection as soon as the page is parsed, even on error
    try:
        servers = Server.Scrape(f)
    finally:
        f.close()

    if servers is None:
        print("No results available, please check manually")
    elif len(servers) == 0:
        print("No players found")
    else:
        shown = 0
        for key in servers:
            shown = shown + 1
            print(servers[key])
            if shown >= maxhits:
                print("*** Stopping after " + str(maxhits) + " hits")
                break