import re,requests,json
# Page that is scraped and the browser-like headers sent with the request.
url = 'http://dianying.2345.com/top/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

# Two capture groups per <li> entry: the link text that precedes
# "</a></span>", and the content of the <span class="sIntro"> element
# (presumably the film title and its short introduction -- confirm against
# the live page markup). re.S lets "." span line breaks inside an entry.
moban = re.compile(
    r'<li>.*?blank">(.*?)</a></span>.*?<span class="sIntro">(.*?)</span>',
    re.S,
)

# Output file; results are appended to it on every run.
OUTPUT_PATH = "C://Users/Administrator/Desktop/" + '2345电影排名爬取.txt'


def fetch_page(page_url=url, request_headers=None, timeout=10):
    """Download *page_url* and return its body decoded as gb2312.

    Args:
        page_url: Address to fetch; defaults to the module-level ``url``.
        request_headers: HTTP headers to send; defaults to the module-level
            ``headers``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts and connection failures.
    """
    if request_headers is None:
        request_headers = headers
    response = requests.get(page_url, headers=request_headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the ranking.
    response.raise_for_status()
    # Avoid garbled Chinese text: the encoding is taken from the page
    # source's <meta charset="gb2312"> declaration.
    response.encoding = 'gb2312'
    return response.text


def parse_items(html):
    """Return every ``moban`` match in *html* as a list of 2-tuples of str.

    An input with no matching entries yields an empty list.
    """
    return moban.findall(html)


def save_items(items, path=OUTPUT_PATH):
    """Append each item to *path* as ``str(item)`` followed by a tab.

    The file is opened once for the whole batch (UTF-8, append mode). When
    *items* is empty nothing is opened, so no empty file is created.
    """
    if not items:
        return
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(str(item) + "\t" for item in items)


def main():
    """Fetch the ranking page, extract its entries and append them to disk."""
    items = parse_items(fetch_page())
    print(type(items))
    save_items(items)


if __name__ == "__main__":
    main()
# 结果展示 (results display)