# 电影天堂小爬虫

## 作用

用 Python 爬取电影天堂的最新电影,并做简单的展示与分析。电影信息先保存为 .csv 文件,然后使用第三方库对信息进行简单分析。

## 代码

```python
from lxml import etree
import requests
import pandas_profiling
import pandas as pd
# Request headers: a desktop Firefox User-Agent so the site serves normal pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) \
Gecko/20100101 Firefox/68.0'
}
# Site root; relative detail-page links from listings are joined onto this.
BASE_DOMAIN = 'https://www.ygdy8.net'
#url = BASE_DOMAIN + '/html/gndy/dyzz/index.html'
def get_detail_urls(url):
    """Fetch one listing page and return absolute URLs of each movie detail page.

    Args:
        url: URL of a listing page (e.g. .../list_23_1.html).

    Returns:
        list[str]: absolute detail-page URLs found in the listing table.
    """
    resp = requests.get(url, headers=HEADERS, timeout=10)
    # The site serves GBK-encoded pages (the detail-page parser already decodes
    # gbk); the original ISO-8859-1 here mojibaked every Chinese character.
    # errors='ignore' tolerates the occasional malformed byte.
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # Relative hrefs inside the listing table -> absolute URLs.
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASE_DOMAIN + href for href in hrefs]
def spider(filename_csv, pages=1):
    """Crawl listing pages, parse every movie detail page, and save to CSV.

    Args:
        filename_csv: path of the CSV file to write.
        pages: number of listing pages to crawl (default 1, matching the
            original hard-coded behavior).
    """
    base_url = 'https://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for page in range(1, pages + 1):
        for detail_url in get_detail_urls(base_url.format(page)):
            movies.append(parser_detail_page(detail_url))
    # index=False keeps the CSV free of a spurious unnamed index column that
    # would otherwise show up as 'Unnamed: 0' when read back.
    pd.DataFrame(movies).to_csv(filename_csv, index=False)
def parser_detail_page(url):
    """Fetch one movie detail page and extract its basic fields.

    Args:
        url: absolute URL of the detail page.

    Returns:
        dict: always contains 'title'; 'cover', 'screenshot', 'year' and
        'category' are included only when found on the page.
    """
    movie = {}
    resp = requests.get(url, headers=HEADERS, timeout=10)
    # Detail pages are GBK; ignore occasional malformed bytes instead of
    # letting a single bad page abort the whole crawl with UnicodeDecodeError.
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    movie['title'] = html.xpath("//title/text()")[0]
    div_zoom = html.xpath("//div[@id='Zoom']")[0]
    # The original `cover, screenshot = ...` raised ValueError on any page
    # with more or fewer than exactly two <img> tags; take what is present.
    imgs = div_zoom.xpath(".//img/@src")
    if len(imgs) > 0:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]
    for info in div_zoom.xpath(".//text()"):
        # Field labels use full-width spaces, e.g. "◎年 代 2019".
        if info.startswith("◎年 代"):
            movie['year'] = info.replace('◎年 代 ', '').strip()
        elif info.startswith("◎类 别 "):
            movie['category'] = info.replace('◎类 别 ', '').strip()
    return movie
if __name__ == "__main__":
    # Crawl the site into a CSV, then build an HTML profiling report from it.
    csv_path = 'dytt.csv'
    spider(csv_path)
    frame = pd.read_csv(csv_path)
    profile = pandas_profiling.ProfileReport(frame)
    profile.to_file('dytt.html')
```