# 目标:爬取某网站的比赛赛程。该页面是动态网页,因此需要先找到对应的 ajax 请求(具体可参考:https://blog.csdn.net/you_are_my_dream/article/details/53399949)
# -*- coding:utf-8 -*-
import sys
import re
import urllib.request
# URL of the ajax request that returns the schedule data (masked in this file).
link = "https://***"
# Build the request with a desktop-browser User-Agent header instead of
# urllib's default one.
r = urllib.request.Request(link)
r.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36')
# Read the whole body inside a `with` block so the HTTP response (and its
# underlying connection) is closed deterministically, even if read() raises.
with urllib.request.urlopen(r, timeout=500) as response:
    html = response.read()
# The raw bytes are decoded as GBK text.
html = html.decode("gbk")
# The response is a large chunk of JSON; only part of it is needed.
# Collect, in order of appearance, the value of every "n" field via a
# non-greedy regex match.
js = re.findall(r'"n":"(.*?)"', html)
i=0
#循环打印比赛信息
try:
while(1):#将字符串Unicode转化为中文,并输出
print (js[i].encode('utf-8').decode('unicode_escape'),js[i+1].encode('utf-8