# Import libraries
import pandas as pd
import re
import time
import requests
# Request the page source
# Request headers for Maoyan. The site blocks plain requests, so a browser
# User-Agent, Referer and a logged-in Cookie are sent.
# NOTE(review): the hard-coded Cookie will expire; refresh it when requests
# start returning Maoyan's verification page instead of the board.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'Cookie': '__mta=142325645.1630981333372.1634619282542.1634619295877.33; _lxsdk_cuid=17bbe10210b8f-01fdc7dd16f0e1-5734174f-144000-17bbe10210cc8; uuid_n_v=v1; uuid=78E04CC0300811ECBC330D5DE33F2BD53D9D77A3A30C459E98998D72D0D88200; _lxsdk=78E04CC0300811ECBC330D5DE33F2BD53D9D77A3A30C459E98998D72D0D88200; _csrf=2189585ecfe099d1d1f33d3dfeff9e6bc61a527269fd18b31d8a18afa2b4291f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1634557351,1634557363,1634557481,1634619252; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; __mta=142325645.1630981333372.1634564791483.1634619253870.29; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1634619296; _lxsdk_s=17c9703d561-6ee-b38-802%7C%7C1',
    'Content-Type': 'text/plain; charset=UTF-8',
    'Origin': 'https://maoyan.com',
    'Referer': 'https://maoyan.com/board/4',
}

# Fetch the first page of the Top-100 board (offset 0 = ranks 1-10).
url = "https://maoyan.com/board/4?offset=0"
response = requests.get(url, headers=headers)
# Fail fast on a non-2xx status instead of silently parsing an error page.
# (The original had a bare `response.text` expression here, which did nothing.)
response.raise_for_status()
# Scrape a single page
# Parse the ten movies on the first page with regular expressions.
# re.S makes "." match newlines, so each pattern can span a whole <dd> entry.
html = response.text

x1 = re.compile('<dd>.*?<img data-src="(.*?)".*?</a>', re.S)
photo = re.findall(x1, html)
print(photo)  # poster image links

x2 = re.compile('<dd>.*?class="name".*?title="(.*?)".*?</a>', re.S)
name = re.findall(x2, html)
print(name)  # movie titles

x3 = re.compile('<dd>.*?<p class="star">.*?主演:(.*?)</p>.*?</div>', re.S)
actor = re.findall(x3, html)
print(actor)  # lead actors

x4 = re.compile('<dd>.*?<p class="releasetime">.*?上映时间:(.*?)</p>.*?</div>', re.S)
# Renamed from `time`: the original shadowed the `time` module imported above.
release_time = re.findall(x4, html)
print(release_time)  # release dates

x5 = re.compile('<dd>.*?<p class="score"><i class="integer">(.*?)</i>.*?</div>', re.S)
pingfen1 = re.findall(x5, html)
print(pingfen1)  # integer part of the score, e.g. "9."

x6 = re.compile('<dd>.*?<p class="score">.*?<i class="fraction">(.*?)</i>.*?</div>', re.S)
pingfen2 = re.findall(x6, html)
print(pingfen2)  # fractional digit of the score, e.g. "5"

# Join the two halves into full scores ("9." + "5" -> "9.5").
# zip also guards against a length mismatch between the two lists.
pingfen = [whole + frac for whole, frac in zip(pingfen1, pingfen2)]
print(pingfen)  # final scores (the original had a bare no-op `pingfen` here)
# Scrape multiple pages
# Download all ten pages of the board (offset 0, 10, ..., 90) and keep the
# raw HTML of each page for parsing below.
server = []
# One Session for the whole crawl: the original created a new Session on
# every iteration, defeating connection/cookie reuse entirely.
session = requests.Session()
for page in range(10):
    url = "https://maoyan.com/board/4?offset=" + str(page * 10)
    response = session.get(url, headers=headers)
    server.append(response.text)
# (The original ended with a bare no-op `server` expression, removed here.)
# Extract the five fields from every saved page and collect them into one
# DataFrame. Patterns are compiled once, outside the loop.
p_img = re.compile('<dd>.*?<img data-src="(.*?)".*?</a>', re.S)
p_name = re.compile('<dd>.*?class="name".*?title="(.*?)".*?</a>', re.S)
p_star = re.compile('<dd>.*?<p class="star">.*?主演:(.*?)</p>.*?</div>', re.S)
p_date = re.compile('<dd>.*?<p class="releasetime">.*?上映时间:(.*?)</p>.*?</div>', re.S)
p_int = re.compile('<dd>.*?<p class="score"><i class="integer">(.*?)</i>.*?</div>', re.S)
p_frac = re.compile('<dd>.*?<p class="score">.*?<i class="fraction">(.*?)</i>.*?</div>', re.S)

frames = []
for page_html in server:
    tupian = p_img.findall(page_html)     # poster links for THIS page
    names = p_name.findall(page_html)     # movie titles
    actors = p_star.findall(page_html)    # lead actors
    dates = p_date.findall(page_html)     # release dates
    ints = p_int.findall(page_html)       # score integer parts, e.g. "9."
    fracs = p_frac.findall(page_html)     # score fraction digits, e.g. "5"
    scores = [whole + frac for whole, frac in zip(ints, fracs)]
    # BUG FIX: the original passed `photo` (always page 1's posters, left over
    # from the single-page section) instead of the per-page `tupian`.
    frames.append(pd.DataFrame({"海报链接": tupian,
                                "电影名称": names,
                                "电影主演": actors,
                                "上映时间": dates,
                                "电影评分": scores}))

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
top = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Export the data
# Export the final table. index=False drops the meaningless row-number column;
# utf-8-sig adds a BOM so Excel opens the CSV's Chinese text correctly.
top.to_excel("maoyan.xlsx", index=False)
top.to_csv("maoyan.csv", index=False, encoding="utf-8-sig")
# (The original post included a screenshot of the code below this line.)