# NOTE: this script only scrapes a single page of the board.
import requests
import re
import time
from lxml import html
import csv
# 1. Fetch the raw HTML of a board (listing) page.
def url_data(url):
    """Download *url* and return the response body as text.

    Sends a desktop-browser User-Agent header, since the site rejects
    the default ``requests`` client string.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Cookie': '',
    }
    # timeout= prevents the crawler from hanging forever on a stalled connection
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    resp.raise_for_status()
    return resp.text
# 2. Pull the per-movie detail URLs out of a listing page.
def data(txt):
    """Return absolute detail-page URLs found in the listing HTML *txt*.

    The board page links each title as ``<p class="name"><a href="/films/...">``;
    the relative paths are captured and prefixed with the site origin.
    """
    relative_paths = re.findall('<p class="name"><a href="(.*?)"', txt)
    return ['https://maoyan.com' + path for path in relative_paths]
# 3. Visit one movie detail page and scrape its data. Release date and
#    region come as a single field; they can be split apart later in Excel.
def xinxi(url):
    """Scrape a movie detail page.

    Returns a dict with keys ``电影名`` (movie title) and ``上映地方``
    (release date/region), matching the CSV header used by ``main``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Cookie': '',
    }
    txt = requests.get(url, headers=headers, timeout=10).text
    tree = html.etree.HTML(txt)
    # The first <h1> on the detail page holds the movie title.
    name = tree.xpath('//h1/text()')[0]
    # The third "ellipsis" list item holds the combined date + region string.
    place = tree.xpath('//li[@class="ellipsis"][3]/text()')[0]
    return {"电影名": name, "上映地方": place}
def main():
    """Crawl the Maoyan Top-100 board (first page only) and write each
    movie's title and release place/date to a CSV file."""
    basis_url = 'https://maoyan.com/board/4?offset='
    fieldnames = ('电影名', '上映地方')
    # Open the output file once up front so it is not reopened for every row.
    # NOTE(review): '文件名字' is a placeholder file name (literally "file
    # name") — replace it with a real path such as 'movies.csv'.
    with open('文件名字', 'a', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames)
        writer.writeheader()
        # Page offsets grow in steps of 10; range(0, 10, 10) yields only 0,
        # i.e. just the first page (see the note at the top of the file).
        for offset in range(0, 10, 10):
            listing_html = url_data(basis_url + str(offset))
            detail_urls = data(listing_html)
            time.sleep(5)  # be polite between listing-page fetches
            for detail_url in detail_urls:
                information = xinxi(detail_url)
                time.sleep(10)  # throttle detail-page requests
                # Write this movie's row to the CSV file.
                writer.writerow(information)
                print('已成功一部')
# Script entry point: run the crawler only when executed directly,
# not when imported as a module. (The call must be indented under the if.)
if __name__ == '__main__':
    main()