1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
| from datetime import datetime import re import time from lxml import etree import requests import pymysql
# HTTP request headers sent with every page fetch: a desktop browser
# User-Agent plus a Chinese Accept-Language.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
    ),
}
# Keyword arguments passed to pymysql.connect().
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file before sharing this script.
DB_CFG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "123456",
    "db": "steam",
    "charset": "utf8mb4",
}
def init_db():
    """Create the configured database and the ``steam_sql`` table if missing.

    The connection is opened WITHOUT a default schema: selecting a schema that
    does not exist yet makes the server reject the connection, which would
    defeat the ``CREATE DATABASE IF NOT EXISTS`` statement on a fresh server.
    The schema is selected only after it is guaranteed to exist.

    Raises:
        pymysql.MySQLError: if the server cannot be reached or a statement
            fails.
    """
    db_name = DB_CFG["db"]
    # Same settings as DB_CFG, minus the default schema.
    server_cfg = {key: value for key, value in DB_CFG.items() if key != "db"}

    conn = pymysql.connect(**server_cfg)
    try:
        with conn.cursor() as cur:
            # Use the configured name instead of a second hard-coded copy.
            cur.execute(
                f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4;"
            )
            conn.select_db(db_name)

            # One row per review page; the review URL is the primary key.
            create_sql = """
            CREATE TABLE IF NOT EXISTS steam_sql (
                用户名 VARCHAR(128),
                评论页面 VARCHAR(512) PRIMARY KEY,
                评价 VARCHAR(32),
                评论 TEXT,
                状态 VARCHAR(32) NULL,
                有价值数 INT NULL,
                欢乐数 INT NULL,
                发布时间 DATETIME NULL,
                修改时间 DATETIME NULL,
                评论人数 INT NULL,
                两周内时长 VARCHAR(32) NULL,
                总时长 VARCHAR(32) NULL,
                等级 INT NULL,
                徽章数 INT NULL,
                库存数 INT NULL,
                评测数 INT NULL,
                好友数 INT NULL,
                组数 INT NULL,
                游戏1 VARCHAR(128) NULL,
                时长1 VARCHAR(32) NULL,
                成就1 VARCHAR(32) NULL,
                游戏2 VARCHAR(128) NULL,
                时长2 VARCHAR(32) NULL,
                成就2 VARCHAR(32) NULL,
                游戏3 VARCHAR(128) NULL,
                时长3 VARCHAR(32) NULL,
                成就3 VARCHAR(32) NULL,
                抓取时间 TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
            """
            cur.execute(create_sql)
        conn.commit()
    finally:
        conn.close()
def save_to_db(row):
    """Insert one scraped review into ``steam_sql``, updating it if present.

    Args:
        row: the complete tuple returned by get_info(); its 27 values are
            bound, in order, to the columns listed in the INSERT statement.

    A new connection is opened for the call and always closed afterwards.
    When the review URL (primary key) already exists, every other column is
    overwritten and the scrape timestamp is refreshed.
    """
    upsert_sql = """
    INSERT INTO steam_sql (
        用户名,评论页面,评价,评论,状态,有价值数,欢乐数,
        发布时间,修改时间,评论人数,两周内时长,总时长,
        等级,徽章数,库存数,评测数,好友数,组数,
        游戏1,时长1,成就1,游戏2,时长2,成就2,游戏3,时长3,成就3
    ) VALUES (
        %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
    )
    ON DUPLICATE KEY UPDATE
        用户名=VALUES(用户名),
        评价=VALUES(评价),
        评论=VALUES(评论),
        状态=VALUES(状态),
        有价值数=VALUES(有价值数),
        欢乐数=VALUES(欢乐数),
        发布时间=VALUES(发布时间),
        修改时间=VALUES(修改时间),
        评论人数=VALUES(评论人数),
        两周内时长=VALUES(两周内时长),
        总时长=VALUES(总时长),
        等级=VALUES(等级),
        徽章数=VALUES(徽章数),
        库存数=VALUES(库存数),
        评测数=VALUES(评测数),
        好友数=VALUES(好友数),
        组数=VALUES(组数),
        游戏1=VALUES(游戏1),
        时长1=VALUES(时长1),
        成就1=VALUES(成就1),
        游戏2=VALUES(游戏2),
        时长2=VALUES(时长2),
        成就2=VALUES(成就2),
        游戏3=VALUES(游戏3),
        时长3=VALUES(时长3),
        成就3=VALUES(成就3),
        抓取时间=CURRENT_TIMESTAMP
    """
    connection = pymysql.connect(**DB_CFG)
    try:
        with connection.cursor() as cursor:
            cursor.execute(upsert_sql, row)
        connection.commit()
    finally:
        connection.close()
def read_urls_from_file(file_path):
    """Return the non-blank lines of a text file, stripped of whitespace.

    Args:
        file_path: path of a UTF-8 text file holding one URL per line.

    Returns:
        A list of the stripped, non-empty lines in file order.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (written by some editors) is dropped instead of being glued to the first
    URL; files without a BOM decode exactly as plain UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]
def format_steam_time(raw):
    """Convert a Chinese Steam timestamp into ``YYYY-MM-DD HH:MM``.

    Args:
        raw: text such as ``"发布于:2023 年 1 月 5 日 下午 3:20"``; the year
            part is optional.

    Returns:
        The formatted timestamp string, or None when the text does not match
        the expected pattern or describes an impossible date/time (for
        example 30 February or "下午 13:00").

    When the year is missing, the current local year is assumed.
    """
    cleaned = raw.replace("发布于:", "").replace("更新于:", "")
    full_match = re.search(
        r'(?:(\d{4}) 年 )?(\d{1,2}) 月 (\d{1,2}) 日 (上午|下午) (\d{1,2}):(\d{2})',
        cleaned
    )
    if not full_match:
        return None

    year_str, month, day, period, hour, minute = full_match.groups()
    year = int(year_str) if year_str else datetime.now().year

    # Convert the 12-hour clock to 24-hour: 12 AM is 00, 12 PM stays 12.
    hour = int(hour)
    if period == '下午' and hour != 12:
        hour += 12
    if period == '上午' and hour == 12:
        hour = 0

    try:
        dt = datetime(year, int(month), int(day), hour, int(minute))
    except ValueError:
        # Digits matched the pattern but do not form a real date/time; treat
        # it like any other unparseable input instead of raising.
        return None
    return dt.strftime("%Y-%m-%d %H:%M")
def _profile_counter(tree, label):
    """Return the integer shown next to the profile label containing *label*.

    Returns 0 when the sibling text is empty and None when the node is missing
    or its text is not a number.
    """
    try:
        text = tree.xpath(
            f'//span[contains(text(),"{label}")]/following-sibling::span/text()'
        )[0]
        return int(text.replace(',', '')) if text else 0
    except Exception:
        return None


def user_info(user_url, header, timeout=30):
    """Scrape level, counters and recent games from a Steam profile page.

    Args:
        user_url: URL of the profile page.
        header: dict of HTTP request headers.
        timeout: seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        A 15-tuple: level, badge count, game count, review count, friend
        count, group count, followed by (name, play time, achievements) for
        up to three recently played games. Any value that cannot be read from
        the page is None.

    Raises:
        requests.RequestException: if the first page fetch fails.

    Parsing failures are caught with ``except Exception`` rather than a bare
    ``except`` so that Ctrl-C (KeyboardInterrupt) still stops the program,
    including while waiting in the anti-scraping loop.
    """
    def fetch():
        # Download the profile page and parse it into an lxml tree.
        reply = requests.get(url=user_url, headers=header, timeout=timeout)
        return etree.HTML(reply.content.decode('utf-8'))

    response = fetch()

    # The block page carries this heading; keep waiting and refetching until
    # the real profile is served.
    try:
        test = response.xpath('//*[@id="mainContents"]/div/h1/text()')[0]
        while test == '抱歉!':
            print("当前页面被反爬,暂停10秒")
            time.sleep(10)
            response = fetch()
            test = response.xpath('//*[@id="mainContents"]/div/h1/text()')[0]
    except Exception:
        # No such heading (normal profile page) or the refetch failed:
        # continue with whatever page we have.
        pass

    try:
        level = int(response.xpath('//*[@class="friendPlayerLevelNum"]/text()')[0])
    except Exception:
        level = None

    badges = _profile_counter(response, "徽章")
    games = _profile_counter(response, "游戏")
    recommended = _profile_counter(response, "评测")
    friends = _profile_counter(response, "好友")
    groups = _profile_counter(response, "组")

    games_list = response.xpath('//div[@class="recent_game_content"]')

    game = [None] * 3
    play_time = [None] * 3
    chievement = [None] * 3
    for i in range(3):
        # Each field is read independently so one missing node does not blank
        # out the others; fewer than three games leaves the rest as None.
        try:
            game[i] = games_list[i].xpath('.//div[@class="game_name"]/a/text()')[0].strip()
        except Exception:
            game[i] = None
        try:
            details = games_list[i].xpath('.//div[@class="game_info_details"]/text()')[0]
            # Second space-separated token, with thousands separators removed.
            play_time[i] = details.strip().split(' ')[1].replace(",", '')
        except Exception:
            play_time[i] = None
        try:
            progress = games_list[i].xpath('.//span[@class="ellipsis"]/text()')[0]
            chievement[i] = progress.strip().replace(" of ", "/")
        except Exception:
            chievement[i] = None

    return (
        level, badges, games, recommended, friends, groups,
        game[0], play_time[0], chievement[0],
        game[1], play_time[1], chievement[1],
        game[2], play_time[2], chievement[2],
    )
def _digits_to_int(text):
    """Return the digits of *text* as an int, or 0 when *text* is empty."""
    return int(re.sub(r'[^\d]', '', text)) if text else 0


def get_info(url, header, timeout=30):
    """Scrape one Steam review page plus its author's profile.

    Args:
        url: URL of the review page; stored as the row's primary key.
        header: dict of HTTP request headers.
        timeout: seconds to wait for the review-page request before giving
            up, so a stalled connection cannot hang the crawl.

    Returns:
        A tuple of user name, review URL, rating, review text, refund status,
        helpful count, funny count, posted time, updated time, comment count,
        play time in the last two weeks and total play time, followed by the
        values returned by user_info() for the author's profile.

    Raises:
        IndexError: if a mandatory node (author, rating, posted time or
            play-time line) is missing from the page.
        requests.RequestException: if the page cannot be fetched.

    Optional fields fall back to a default instead of raising. Those
    fallbacks use ``except Exception`` rather than a bare ``except`` so that
    Ctrl-C still stops the program.
    """
    reply = requests.get(url=url, headers=header, timeout=timeout)
    response = etree.HTML(reply.content.decode('utf-8'))

    author_link = '//*[@id="responsive_page_template_content"]/div/div[1]/div/div/span[1]/a'
    user_name = response.xpath(author_link + '/text()')[0].strip()
    user_url = response.xpath(author_link + '/@href')[0].strip()

    valuation = response.xpath('//*[@id="ReviewTitle"]/div[1]/div[1]/text()')[0].strip()

    # Join all text nodes, collapse whitespace runs, drop zero-width spaces.
    review_text = ' '.join(response.xpath('//*[@id="ReviewText"]//text()'))
    recommend = ' '.join(review_text.split()).replace('\u200b', '')

    try:
        # The first two characters of the refund notice are dropped.
        status = response.xpath('//*[@class="refunded tooltip"]//text()')[0].strip()[2:]
    except Exception:
        status = "未退款"

    try:
        helpful_count = _digits_to_int(
            response.xpath('//*[@id="leftContents"]/div[2]/text()[1]')[0]
        )
    except Exception:
        helpful_count = 0

    try:
        happy = _digits_to_int(
            response.xpath('//*[@id="leftContents"]/div[2]/text()[2]')[0]
        )
    except Exception:
        happy = 0

    posted_time = format_steam_time(
        response.xpath('//*[@id="ReviewTitle"]/div[2]/text()[1]')[0].strip()
    )

    try:
        updated_time = format_steam_time(
            response.xpath('//*[@id="ReviewTitle"]/div[2]/text()[2]')[0].strip()
        )
    except Exception:
        updated_time = None

    try:
        relpy_counts = _digits_to_int(
            response.xpath(
                '//div[@class="commentthread_header_and_count"]'
                '//span[contains(@id,"totalcount")]/text()'
            )[0]
        )
    except Exception:
        relpy_counts = 0

    # Play-time line: "<last two weeks> / <total>".
    the_time = response.xpath('//*[@id="ReviewTitle"]/div[1]/div[2]/text()')[0].strip().split('/')

    # str.split() always returns at least one element, so a length check
    # cannot detect a missing value; catch the failed lookup instead.
    try:
        two_weeks_play_time = the_time[0].strip().split('周')[1].split('小时')[0].strip()
    except IndexError:
        two_weeks_play_time = None

    try:
        all_time = the_time[1].strip().split(' ')[1]
    except IndexError:
        all_time = None

    return (
        user_name, url, valuation, recommend, status, helpful_count, happy,
        posted_time, updated_time, relpy_counts, two_weeks_play_time, all_time,
        *user_info(user_url, header),
    )
def load_to_sql(urls):
    """Scrape every URL in *urls* and upsert the result into the database.

    Each URL is fetched with get_info() and stored with save_to_db(). When
    either step raises, the error is printed, the URL is recorded as failed,
    and the crawl pauses for 10 seconds before moving on.

    Returns:
        The list of URLs that failed, in the order they were attempted.
    """
    failed = []
    for target in urls:
        try:
            save_to_db(get_info(target, header))
            print(f"[√] 已写入/更新:{target}")
        except Exception as exc:
            print(f"[×] 处理失败:{target},错误:{exc}")
            print("等待10秒......")
            failed.append(target)
            time.sleep(10)
    return failed
if __name__ == '__main__':
    # Script entry point: prepare the schema, crawl every listed review URL,
    # then retry the failures a bounded number of times.
    init_db()

    urls = read_urls_from_file('评论URL.txt.txt')

    # Retrying until the failure list is empty never terminates when a URL
    # fails permanently (e.g. a deleted review), and the report below could
    # then only ever print an empty list. Cap the number of retry rounds.
    MAX_RETRY_ROUNDS = 3

    defeat_url = load_to_sql(urls)
    retry_round = 0
    while defeat_url and retry_round < MAX_RETRY_ROUNDS:
        retry_round += 1
        defeat_url = load_to_sql(defeat_url)

    # URLs that still failed after all retry rounds (empty on full success).
    print("爬取失败的URL:",defeat_url)
|