学习 python 网络爬虫第一天
本博客纯用于学习记录,无其它用途。
用到的库
- requests 库,requests 库并不是 python 自带库,需要先用 pip 安装:pip install requests
- lxml 库,同样需要用 pip 安装:
pip install lxml
案例:电影天堂
# encoding: utf-8
import requests
from lxml import etree
# Site root; prepended to the relative detail-page links extracted in get_detail_urls().
BASE_DOMAIN = 'https://www.dytt8.net/'
HEADERS = {
    # Present as a desktop Chrome browser so the site serves normal pages.
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
# 获取所有页面的url函数
def get_detail_urls(url):
    """Fetch one list page and return the absolute URL of every movie detail page on it.

    :param url: URL of a list page, e.g. .../html/gndy/dyzz/list_23_3.html
    :return: list of absolute detail-page URLs
    """
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages. Decode explicitly (skipping the odd bad
    # byte) instead of trusting `response.text`'s encoding guess, which produces
    # mojibake here — the same reason parse_detail_page() decodes GBK by hand.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # Each movie entry sits in a <table class="tbspan">; its <a href> holds a
    # site-relative link to the detail page.
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASE_DOMAIN + detail_url for detail_url in detail_urls]
# 解析详情页的函数,由下方 spider() 调用
def parse_detail_page(url):
    """Fetch one movie detail page and scrape its metadata.

    :param url: absolute URL of a detail page
    :return: dict with (at most) the keys title, year, country, category,
             douban_goal, length, diractor, actors, profile, download_url;
             fields missing from the page are simply absent from the dict
    """
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # Detail pages are GBK-encoded; decode explicitly.
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    # The trailing /text() step yields the text node directly, so no
    # etree.tostring()/decode round-trip is needed to get plain text.
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    def parse_info(info, rule):
        # Drop the "◎xxx" field label; strip() removes surrounding whitespace.
        return info.replace(rule, "").strip()

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    # All text nodes of the info box, roughly one "◎field: value" per entry.
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            movie['year'] = parse_info(info, "◎年 代")
        elif info.startswith("◎产 地"):
            movie['country'] = parse_info(info, "◎产 地")
        elif info.startswith("◎类 别"):
            movie['category'] = parse_info(info, "◎类 别")
        elif info.startswith("◎豆瓣评分"):
            movie['douban_goal'] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片 长"):
            movie['length'] = parse_info(info, "◎片 长")
        elif info.startswith("◎导 演"):
            # NOTE: 'diractor' is a typo, kept so existing consumers of the
            # returned dict keep working.
            movie['diractor'] = parse_info(info, "◎导 演")
        elif info.startswith("◎主 演"):
            # The cast spans several lines; collect until the next "◎" field starts.
            actors = [parse_info(info, "◎主 演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简 介"):
            # BUG FIX: the original rebound `profile` to each subsequent line
            # instead of appending, so only the final line before "◎获奖情况"
            # survived. Accumulate all synopsis lines, mirroring the actors branch.
            profile = [parse_info(info, "◎简 介")]
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎获奖情况"):
                    break
                profile.append(line)
            movie['profile'] = profile
    # The download link lives in the highlighted (bgcolor #fdfddf) table cell.
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
    return movie
# pass # ignore the def
def spider():
    """Crawl list pages 3-4 of the 'dyzz' category and print each movie's metadata."""
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    # range(3, 5) -> list pages 3 and 4 (note: NOT 7 pages as the old comment claimed).
    for page in range(3, 5):
        list_url = base_url.format(page)
        # Walk every movie detail URL found on this list page.
        for detail_url in get_detail_urls(list_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
# Run the crawler only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    spider()
推荐阅读: