Python3解析库Xpath爬取豆瓣top250每个电影详细介绍
思路:
首页的内容如图
发现每个电影都是一个li标签,li标签里面的有电影详细介绍的网址
所以我们只需要将每个页面中的li标签里面的网址爬取下来,然后对每个网址进行二次爬取。另外观察分页时网址的变化规律,
发现只是网址的start参数变了
不说了上代码,我这里是爬取电影名,电影的导演,部分演员,上映时间,电影介绍等
import json
import requests
import re
from requests.exceptions import RequestException
import time
from lxml import etree
def write_to_file(content):
    """Append *content* to wzhi.txt as a single JSON line (UTF-8, non-ASCII preserved)."""
    line = json.dumps(content, ensure_ascii=False)
    with open('wzhi.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
def get_one_page(url):
    """Fetch *url* and return the response body as text, or None on any failure.

    Sends a desktop-browser User-Agent (Douban rejects the default
    python-requests UA) and bounds the request with a timeout so a
    stalled server cannot hang the crawler indefinitely — the original
    call had no timeout, which is the classic requests pitfall.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        print(response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network error / timeout: signal failure to the caller instead of raising.
        return None
def crow(url):
    """Scrape one Douban movie detail page and append its fields to wen.txt.

    Fields written, semicolon-separated: subject id, title, director,
    up to five actors, up to five genres, release date, poster URL,
    synopsis, rating.

    Fixes over the original:
    - returns early when the fetch fails (the original passed None to
      etree.HTML and crashed);
    - the release date is taken with a simple split on '(' instead of
      the replace/regex dance, which raised AttributeError whenever the
      date string contained no parenthesized region;
    - removed the duplicated, never-used `country`/`Language` xpaths and
      the redundant f.close() inside the `with` block.
    """
    page = get_one_page(url)
    if page is None:
        return  # fetch failed; skip this movie instead of crashing
    html = etree.HTML(page)
    # The subject id is the path segment after /subject/ in the URL.
    movieid = re.match(r'https://.*?/.*?/(.*?)/.*?', url)
    moviename = html.xpath('//span[@property="v:itemreviewed"]/text()')
    score = html.xpath('//div[@class="rating_self clearfix"]/strong/text()')
    imgsrc = html.xpath('//div[@id="mainpic"]//img/@src')
    director = html.xpath('//span[@class="attrs"]/a[@rel="v:directedBy"]/text()')
    actors = html.xpath('//span[@class="attrs"]/a[@rel="v:starring"]/text()')
    genres = html.xpath('//span[@property="v:genre"]/text()')
    release_time = html.xpath('//span[@property="v:initialReleaseDate"]/text()')
    # Entries look like "1994-09-10(加拿大)"; keep only the leading date part.
    release = release_time[0].split('(')[0] if release_time else ''
    synopsis = html.xpath('//div[@class="related-info"]//span[@property="v:summary"]/text()')
    print(str(moviename).strip('[]'))
    print(release)
    print(movieid.group(1))
    print(director[0])
    with open('wen.txt', 'a', encoding='utf-8') as f:
        f.write(str(movieid.group(1)) + ";")
        f.write("'" + moviename[0].strip() + "';")
        f.write("'" + director[0].strip() + "';")
        f.write(str(actors[0:5]).strip('[]') + ";")
        f.write(str(genres[0:5]).strip('[]') + ";")
        f.write("'" + release + "';")
        f.write("'" + imgsrc[0].strip() + "';")
        f.write("'" + synopsis[0].strip() + "';")
        f.write(score[0].strip())
        f.write("\n")
def parse_one_page(html):
    """Extract every movie-detail URL from a top250 list page and crawl each one.

    Each movie sits in its own <li>; the regex captures the href of the
    detail link inside it.
    """
    detail_urls = re.findall('<li>.*?<em.*?<a href="(.*?)">.*?</li>', html, re.S)
    for detail_url in detail_urls:
        print(detail_url)
        crow(detail_url)
def main(offset):
    """Fetch one page of the top-250 listing (25 movies, starting at *offset*) and parse it."""
    url = f'https://movie.douban.com/top250?start={offset}&filter='
    print(url)
    listing_html = get_one_page(url)
    parse_one_page(listing_html)
if __name__ == '__main__':
    # Top 250 = 10 pages of 25 movies each: offsets 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        main(offset=offset)
运行结果:
里面用到了python的requests库的方法等