回头来复习一下正则表达式,选了猫眼电影来练练手
import requests
import re
import csv
import codecs
import time
# HTTP headers sent with every page request: a desktop Chrome User-Agent plus
# a Referer pointing at the board being scraped.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
    'Referer': 'https://maoyan.com/board/4?offset=90',
}

# Open the output CSV once, in append mode. The 'utf-8-sig' codec emits the
# UTF-8 BOM only when writing starts at offset 0, so an existing file does not
# get a second BOM injected in the middle on reruns.
fp = open('F://猫眼电影.csv', 'a+', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)

# In append mode the stream is positioned at end-of-file right after opening,
# so a position of 0 means the file is empty and still needs its header row.
if fp.tell() == 0:
    writer.writerow(['电影', '演员', '上映时间', '评分'])
# Patterns for the per-movie fields on a board page. Raw strings keep regex
# escapes such as ``\s`` from being treated as (invalid) string escapes.
_TITLE_RE = re.compile(r'<p class="name">.*?<.*?title="(.*?)"', re.S)
_STAR_RE = re.compile(r'<p class="star">\s*(.*?)\s*</p>')
_RELEASE_RE = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)
_SCORE_INT_RE = re.compile(r'<i class="integer">(.*?)</i>')
_SCORE_FRAC_RE = re.compile(r'<i class="fraction">(.*?)</i>')


def _extract_rows(html):
    """Return one ``[title, star, release, score]`` list per movie in *html*."""
    titles = _TITLE_RE.findall(html)
    stars = _STAR_RE.findall(html)
    releases = _RELEASE_RE.findall(html)
    score_ints = _SCORE_INT_RE.findall(html)
    score_fracs = _SCORE_FRAC_RE.findall(html)
    # NOTE: the fields are matched independently and paired by position; zip
    # stops at the shortest list, so an entry missing one field would shift
    # or drop the rows that follow it.
    return [
        # The score is split across two tags (integer and fraction parts);
        # concatenate the two strings to rebuild it.
        [title, star, release, whole + frac]
        for whole, frac, title, star, release in zip(
            score_ints, score_fracs, titles, stars, releases
        )
    ]


def explian_HTML(url):
    """Fetch one board page and append its movies to the CSV.

    Prints *url*, downloads it with the module-level ``headers``, extracts
    title, cast, release date and score for each movie, and writes one row
    per movie through the module-level ``writer``.

    Args:
        url: Address of the board page to download.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    print(url)
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=10)
    writer.writerows(_extract_rows(response.text))
if __name__ == '__main__':
    # Board pages are addressed by offset: 0, 10, ..., 90 (ten URLs).
    urls = ["https://maoyan.com/board/4?offset={}".format(i) for i in range(0, 91, 10)]
    try:
        for url in urls:
            explian_HTML(url)
            # Pause between requests; fetching too fast makes the crawl fail.
            time.sleep(0.5)
    finally:
        # Close the CSV even if a request or parse step raised, so rows
        # already written are flushed to disk.
        fp.close()