#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Scrape the first ten games on chinaplay.store (game name, original price,
# sale price, discount) and write them to an Excel-compatible CSV file.
import csv
import re
import ssl
from urllib import request as req
# Disable TLS certificate verification globally so urlopen succeeds even
# when the site's certificate chain cannot be validated locally.
# NOTE(review): this weakens security for ALL https requests in the process.
ssl._create_default_https_context = ssl._create_unverified_context
class CrawlChinaplay(object):
    """Crawl chinaplay.store, extract each listed game's name, original
    price, sale price and discount badge, and append the first (up to) ten
    rows to an Excel-compatible CSV file ``chinaplay.csv``."""

    def fetch_html(self):
        """Download the listing page and return its HTML as a str.

        Returns:
            The UTF-8 decoded page body.
        """
        url = ("https://chinaplay.store/?countBuy=1&genre2=actions"
               "&utm_expid=.D1txQMWbRSepiu4sNydAgA.1&utm_referrer=")
        return req.urlopen(url).read().decode("utf-8")

    def analysis(self, html_contents):
        """Parse product cards out of the page HTML.

        Args:
            html_contents: raw HTML of the listing page.

        Returns:
            A list of ``[game_name, original_price, sale_price, discount]``
            string lists, one per product card that matched.
        """
        # A product card runs from '<div class="product"' to the "熊猫点"
        # marker followed by four closing </div> tags.
        root_regex = r'<div class="product"[\s\S]*?熊猫点\s*</div>\s*</div>\s*</div>\s*</div>'
        # Groups: (1) discount badge, (2) title, (3) old price, (4) sale price.
        sub_regex = r'<div class="product"[\s\S]*?<div class="action-badge">(.*?)<[\s\S]*?<div class="game-title">[\t|\n]*(.*?)[\t|\n]*?</[\s\S]*?old_price">(.*?)<[\s\S]*?">(.*?)</span>'
        sub_pattern = re.compile(sub_regex)  # compile once, match per card
        result_list = []
        for content in re.findall(root_regex, html_contents):
            match = sub_pattern.match(content)
            if match:
                badge, name, ori_price, sale_price = match.groups()
                # Output order: name, original price, sale price, discount.
                result_list.append([name, ori_price, sale_price, badge])
        return result_list

    def sava_csv(self, result_list):
        """Append a header plus at most the first 10 rows to chinaplay.csv.

        Note: the file is opened in append mode, so each run adds a new
        header line before its data rows.  (Method name kept as ``sava_csv``
        for backward compatibility with existing callers.)

        Args:
            result_list: rows as produced by :meth:`analysis`.
        """
        # "with" guarantees the handle is closed even if a write fails
        # (the original leaked it on exception).
        with open("chinaplay.csv", "a+", newline="", encoding="utf-8") as out:
            csv_writer = csv.writer(out, dialect="excel")
            csv_writer.writerow(['game_name', 'ori_price', 'sale_price', 'discount'])
            # Slicing caps output at 10 rows and handles shorter lists too,
            # replacing the original duplicated while-loops.
            csv_writer.writerows(result_list[:10])

    def crawl(self):
        """Entry point: fetch the page, parse it, save the results."""
        html_content = self.fetch_html()
        self.sava_csv(self.analysis(html_content))
if __name__ == "__main__":
crawl_chinaplay = CrawlChinaplay()
crawl_chinaplay.crawl()
# NOTE: this site loads rather slowly, so the crawl may take a while.
# Result: