Inspecting the page source
First, analyze the data requests the site makes:
Open the new URL:
You can see that it returns JSON data (so the json module needs to be imported):
import scrapy
import json
from scrapy import Request
from scrapy.http import TextResponse
from lottery_crawls.items import BaskStation  # item class defined in items.py below (path assumes the default project layout)
Using items
Add the following code to items.py:
import scrapy


class BaskStation(scrapy.Item):
    no_str = scrapy.Field()    # match number string
    league = scrapy.Field()    # league abbreviation
    datetime = scrapy.Field()  # match date and time
    home = scrapy.Field()      # home team name
    away = scrapy.Field()      # away team name
    a_sf = scrapy.Field()      # away team recent wins/losses
    h_sf = scrapy.Field()      # home team recent wins/losses
    battle = scrapy.Field()    # head-to-head record
    a_rate = scrapy.Field()    # away team win rate
    h_rate = scrapy.Field()    # home team win rate
    a_ds = scrapy.Field()      # away team avg points scored/conceded
    h_ds = scrapy.Field()      # home team avg points scored/conceded
Extracting the fields from the JSON
name = "bask_station"  # spider name
allowed_domains = ["webapi.sporttery.cn"]
start_urls = ["https://webapi.sporttery.cn/gateway/jc/basketball/getMatchListV1.qry"]
custom_settings = {
    "ITEM_PIPELINES": {
        "lottery_crawls.pipelines.pipelines_game.BaskStationPipeline": 300,
    },
    "DOWNLOADER_MIDDLEWARES": {
        "lottery_crawls.middlewares.LotteryCrawlsDownloaderMiddleware": 300,
    },
}
When other pages are crawled, the URL changes; if the new domain is not listed in allowed_domains, those requests are filtered out and nothing gets crawled, so update the domains in allowed_domains accordingly (a small illustration follows).
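For illustration only (both endpoints used by this spider share the webapi.sporttery.cn domain, so nothing needs to change here): if a later request also had to hit the main www.sporttery.cn site, the list would become:

allowed_domains = ["webapi.sporttery.cn", "www.sporttery.cn"]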
def start_requests(self):
    yield Request(
        url="https://webapi.sporttery.cn/gateway/jc/basketball/getMatchListV1.qry?clientCode=3001&leagueId=1"
    )
You can override this method to suit your own needs, for example to customize the initial requests, attach headers, or pass extra query parameters; a sketch follows.
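For instance, a minimal sketch of an overridden start_requests that attaches a User-Agent header and builds the query string by hand (the header value and the leagueId parameter are illustrative, not something the API requires):

def start_requests(self):
    # illustrative override: same entry URL, plus a custom header and a hand-built query string
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0"}
    params = "clientCode=3001&leagueId=1"  # leagueId value is an example only
    yield Request(
        url=f"https://webapi.sporttery.cn/gateway/jc/basketball/getMatchListV1.qry?{params}",
        headers=headers,
        callback=self.parse,
    )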
def parse(self, response: TextResponse, **kwargs):
    date_group = json.loads(response.text)["value"]["matchInfoList"]
    for group in date_group:
        items = group["subMatchList"]
        for item in items:
            basketball_item = BaskStation()
            basketball_item["no_str"] = item["matchNumStr"]
            basketball_item["datetime"] = str(item["matchDate"])[5:16] + " " + item["matchTime"]
            basketball_item["home"] = item["homeTeamAbbName"]
            basketball_item["away"] = item["awayTeamAbbName"]
            basketball_item["league"] = item["leagueAbbName"]
            match_id = item["matchId"]
            yield Request(
                url=f"https://webapi.sporttery.cn/gateway/uniform/basketball/getMatchFeatureV1.qry?termLimits=10&gmMatchId={match_id}",
                meta={'key': basketball_item},
                callback=self.data_analysis,
                dont_filter=True
            )
In a CrawlSpider Rule, the callback is written as a function-name string, e.g. callback='parse_item'.
In a basic Spider, when you send a new Request yourself, the callback is the bound method itself, e.g. callback=self.parse_item; a short comparison sketch follows.
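A minimal sketch of the contrast, using a hypothetical CrawlSpider that is not part of this project:

from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = "example"
    start_urls = ["https://webapi.sporttery.cn/"]
    # in a Rule, the callback is referenced by name, as a string
    rules = (
        Rule(LinkExtractor(allow=r"getMatchListV1"), callback="parse_item"),
    )

    def parse_item(self, response):
        # in a hand-written Request, the callback is the bound method itself
        yield Request(response.url, callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        pass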
The JSON returned by the new URL is structured as follows:
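Since only a handful of fields are used, here is a hedged sketch of the relevant part of that structure, reconstructed from the keys that data_analysis reads below; the values are made-up placeholders and the many other fields of the real response are omitted:

{
    "value": {
        "scoreAvg": {"homeGoalAvgCnt": "105.2", "awayGoalAvgCnt": "98.7"},
        "lossScoreAvg": {"homeLossGoalAvgCnt": "99.1", "awayLossGoalAvgCnt": "101.3"},
        "eachHomeAway": {
            "homeWinGoalMatchCnt": 6, "homeLossGoalMatchCnt": 4,
            "awayWinGoalMatchCnt": 5, "awayLossGoalMatchCnt": 5,
            "homeScoreRatio": "60", "awayScoreRatio": "50"
        },
        "last": {"homeWinGoalMatchCnt": 2, "homeLossGoalMatchCnt": 1},
        "homeTeamShortName": "湖人"
    }
}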
def data_analysis(self, response: TextResponse):
    item = response.meta['key']
    date_group = json.loads(response.text)["value"]
    # away team: average points scored / conceded per game
    a_d = date_group["scoreAvg"]["awayGoalAvgCnt"]
    a_s = date_group["lossScoreAvg"]["awayLossGoalAvgCnt"]
    item["a_ds"] = a_d + "/" + a_s
    # home team: average points scored / conceded per game
    h_d = date_group["scoreAvg"]["homeGoalAvgCnt"]
    h_s = date_group["lossScoreAvg"]["homeLossGoalAvgCnt"]
    item["h_ds"] = h_d + "/" + h_s
    # away team: recent wins / losses
    a_S = str(date_group["eachHomeAway"]["awayWinGoalMatchCnt"])
    a_F = str(date_group["eachHomeAway"]["awayLossGoalMatchCnt"])
    item["a_sf"] = a_S + "胜/" + a_F + "负"
    # home team: recent wins / losses
    h_S = str(date_group["eachHomeAway"]["homeWinGoalMatchCnt"])
    h_F = str(date_group["eachHomeAway"]["homeLossGoalMatchCnt"])
    item["h_sf"] = h_S + "胜/" + h_F + "负"
    # away team win rate, home team win rate
    item["a_rate"] = date_group["eachHomeAway"]["awayScoreRatio"] + "%"
    item["h_rate"] = date_group["eachHomeAway"]["homeScoreRatio"] + "%"
    # head-to-head record
    home = date_group["homeTeamShortName"]
    h_battle_s = str(date_group["last"]["homeWinGoalMatchCnt"])
    h_battle_f = str(date_group["last"]["homeLossGoalMatchCnt"])
    item["battle"] = "主队" + home + h_battle_s + "胜" + h_battle_f + "负"
    yield item
Connecting to the database in pipelines.py
Use pymysql to connect to the database, and save the scraped data through an item pipeline.
The basic pymysql workflow is (a standalone sketch follows this list):
1. conn — open the connection (the port must be an int, and the charset must not contain a hyphen, e.g. utf8mb4 rather than utf-8)
2. cursor — obtain a cursor
3. cursor.execute(sql)
4. conn.commit()
5. cursor.close()
6. conn.close()
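A minimal standalone sketch of that workflow, assuming a local MySQL server; the host, credentials, and database name below are placeholders for illustration:

import pymysql

# placeholder connection parameters
conn = pymysql.connect(host="127.0.0.1", port=3306,           # the port must be an int
                       user="dev", password="123456",
                       database="crawler", charset="utf8mb4")  # no hyphen in the charset
cursor = conn.cursor()
cursor.execute("SELECT VERSION()")  # run a statement
print(cursor.fetchone())
conn.commit()                       # needed when the statement modifies data
cursor.close()
conn.close()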
def __init__(self):
    self.tableName = 'game_baskstation'
    self.data = []
    # conn is the database config dict added to settings.py (shown below)
    self.conn = pymysql.connect(host=conn.get("host"), port=conn.get("port"),
                                user=conn.get("user"), password=conn.get("password"),
                                database=conn.get("database"), charset=conn.get("charset"))
    self.cursor = self.conn.cursor()

def close_spider(self, spider):
    self.conn.close()
def process_item(self, item, spider):
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    no_str = item["no_str"]
    league = item["league"]
    datetime_ = item["datetime"]
    home = item["home"]
    away = item["away"]
    a_sf = item["a_sf"]
    h_sf = item["h_sf"]
    battle = item["battle"]
    a_rate = item["a_rate"]
    h_rate = item["h_rate"]
    a_ds = item["a_ds"]
    h_ds = item["h_ds"]
    try:
        # parameterized query: let the driver escape the values
        self.cursor.execute(
            f"SELECT id FROM `{self.tableName}` WHERE no_str = %s AND league = %s AND home = %s AND away = %s AND datetime_ = %s",
            (no_str, league, home, away, datetime_)
        )
        ret = self.cursor.fetchone()
        if ret:
            # the match already exists: refresh its data
            logging.info(f"update--{ret[0]}")
            self.data.append(
                (current_time, datetime_, a_sf, h_sf, battle, a_rate, h_rate, a_ds, h_ds)
            )
            self._write_to_db_update(ret)
            self.data.clear()
        else:
            # first time this match is seen: insert a new row
            logging.info("insert--")
            self.data.append(
                (current_time, no_str, league, datetime_, home, away, a_sf, h_sf, battle, a_rate, h_rate, a_ds, h_ds)
            )
            self._write_to_db_insert()
            self.data.clear()
    except Exception as error:
        logging.error(error)
    return item
Inserting new rows
def _write_to_db_insert(self):
    self.cursor.execute(
        f"INSERT INTO `{self.tableName}` (create_time, no_str, league, datetime_, home, away, a_sf, h_sf, battle, a_rate, h_rate, a_ds, h_ds)"
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        self.data[0]
    )
    self.conn.commit()
Updating existing rows
def _write_to_db_update(self, ret):
    sql = f"UPDATE `{self.tableName}` SET " \
          "update_time=%s, datetime_=%s, a_sf=%s, h_sf=%s, battle=%s, a_rate=%s, h_rate=%s, a_ds=%s, h_ds=%s" \
          f" WHERE id = {ret[0]}"
    self.cursor.execute(sql, self.data[0])
    self.conn.commit()
Add the database configuration to settings.py
"""
数据库连接
"""
conn = {
"host": "",
"port": 3306,
"user": "dev",
"password": "123456",
"database": "crawler",
"charset": "utf8mb4"
}
Also register the pipeline in settings.py
ITEM_PIPELINES = {
    "lottery_crawls.pipelines.pipelines_game.BaskStationPipeline": 300,
}
Run the spider (you can create a debugger.py for this):
from scrapy import cmdline

# equivalent to running `scrapy crawl bask_station` from the project root
cmdline.execute("scrapy crawl bask_station".split(' '))
The final crawled results:
Appendix: Scrapy ships with a default UserAgentMiddleware for setting the request UA. If nothing else is specified, Scrapy uses its predefined UA string, or a global UA can be set via USER_AGENT in settings.py. To rotate the UA per request instead, you can keep a list of UA strings and pick one at random in a custom downloader middleware (see the sketch after the list below).
USER_AGENT_LIST = [
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]
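A minimal sketch of such a rotation middleware, assuming the list above is kept in settings.py; the class name and import path here are illustrative (the project's own LotteryCrawlsDownloaderMiddleware may do something different):

import random

from lottery_crawls.settings import USER_AGENT_LIST  # assumes the list above lives in settings.py


class RandomUserAgentMiddleware:
    """Downloader middleware that assigns a random User-Agent to every outgoing request."""

    def process_request(self, request, spider):
        # pick a UA at random and set it on the request headers
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
        return None  # let Scrapy continue handling the request

Enable it via DOWNLOADER_MIDDLEWARES, either in settings.py or in the spider's custom_settings, e.g. "lottery_crawls.middlewares.RandomUserAgentMiddleware": 400 (the path assumes the class is placed in middlewares.py).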