使用 requests 和 lxml 爬取电影天堂网站的电影数据
完整代码如下:
# -*- coding: utf-8 -*-
# 邮箱:3195841740@qq.com
# 人员:21292
# 日期:2020/3/7 17:38
# 工具:PyCharm
import json
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Shared accumulator for the fields scraped from one movie detail page.
# NOTE(review): this is a module-level mutable dict reused across calls to
# spider(); stale keys from a previous movie can leak into the next record.
movie = {}
# url = 'https://www.dytt8.net/html/gndy/jddy/20160320/50523.html'
# Minimal request headers; a browser-like User-Agent avoids trivial bot blocking.
headers = {
'User-Agent' :'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0'
}
def get_detail_urls(url):
    """Fetch a listing page and crawl every movie detail link found on it.

    The last four hrefs under the red <font> block are navigation links,
    not movie pages, so they are skipped (hence the [:-4] slice).
    """
    response = requests.get(url, headers=headers)
    # dytt8 serves GBK/GB2312 pages that often contain stray bytes;
    # errors='ignore' keeps one bad byte from killing the whole crawl.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    detail_urls = html.xpath('//font[@color = "#ff0000"]/p//@href')
    for detail_url in detail_urls[:-4]:
        # Hrefs may be site-relative; resolve against the listing page URL.
        detail_url = urljoin(url, detail_url)
        print(detail_url)
        spider(detail_url)
# Single-line "◎" labels on a dytt8 detail page, mapped to result-dict keys.
# The label strings (including their internal spacing) must match the page
# text exactly, so they are kept byte-for-byte from the original script.
_SIMPLE_FIELDS = (
    ('◎年 代', '年代'),
    ('◎产 地', '产地'),
    ('◎语 言', '语言'),
    ('◎类 别', '类别'),
    ('◎豆瓣评分', '豆瓣评分'),
    ('◎片 长', '片长'),
    ('◎导 演', '导演'),
)


def spider(url):
    """Scrape one movie detail page, print its fields, and append them to
    电影文件.json.

    Fills the module-level ``movie`` dict (kept for backward compatibility
    with the original script's structure).
    """
    response = requests.get(url, headers=headers)
    # Pages are GBK-encoded and occasionally contain invalid bytes.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    name = html.xpath('//div[@class = "co_area2"]/div[1]/h1/font/text()')
    if name:
        # The title is wrapped in 《...》; fall back to the raw heading text
        # when the brackets are missing instead of crashing on [0].
        matched = re.findall(r'[《](.*?)[》]', name[0])
        movie['电影名称'] = matched[0] if matched else name[0].strip()
    infos = html.xpath('//div[@id = "Zoom"]//text()')
    _parse_infos(infos)
    _print_movie()
    _save_movie()


def _parse_infos(infos):
    """Walk the text nodes of the #Zoom block and fill ``movie``.

    Replaces the original elif chain (which also contained a duplicated,
    unreachable '◎片 长' branch) with a table-driven loop.
    """
    for index, info in enumerate(infos):
        handled = False
        for label, key in _SIMPLE_FIELDS:
            if info.startswith(label):
                movie[key] = info.replace(label, '').strip()
                handled = True
                break
        if handled:
            continue
        if info.startswith('◎主 演'):
            # First actor shares the label line; the rest follow on their
            # own lines until the next "◎" label starts a new field.
            actors = [info.replace('◎主 演', '').strip()]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith('◎'):
                    break
                actors.append(actor)
            movie['主演'] = actors
        elif info.startswith('◎简 介 '):
            # The synopsis text is the node immediately after the label.
            movie['简介'] = infos[index + 1].strip()


def _print_movie():
    """Pretty-print the scraped fields, framing them with a '*' rule."""
    print("*" * 30)
    for key, value in movie.items():
        if key == '主演':
            # First actor on the labelled line, the rest indented below it.
            for i, actor in enumerate(value):
                if i == 0:
                    print(key, ':', actor)
                else:
                    print('\t', actor)
        else:
            print(key, ':', value)
    print("*" * 30)


def _save_movie():
    """Append the movie as one JSON object to 电影文件.json.

    NOTE(review): appending objects separated by ',\\n' does not yield a
    valid JSON document as a whole; preserved to match the original output
    format.
    """
    with open('电影文件.json', 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(dict(movie), indent=2, ensure_ascii=False) + ',\n')
def start():
    """Entry point: begin the crawl from a fixed seed page."""
    seed = 'https://www.dytt8.net/html/gndy/jddy/20160320/50523.html'
    get_detail_urls(seed)


if __name__ == '__main__':
    start()
在爬取的过程中发现,电影天堂网站的响应很不稳定,请求经常超时或失败,因此数据有时抓取不到。
使用相同方法抓取 豆瓣电影
这个比抓取电影天堂快很多。