Python Study Notes (1): Web Scraping with the Requests Library

Day one of learning web scraping with Python.

This post is purely a learning record; it has no other purpose.

Libraries used

  1. requests — note that requests is not part of the Python standard library, so it has to be installed with pip.
  2. lxml — also installed with pip:

 

pip install requests lxml
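Before the full case study, here is a minimal sketch of how the two libraries work together: requests downloads the page, and lxml's etree parses the HTML so it can be queried with XPath. The URL and the XPath expression below are only placeholders, not part of the case study.

# encoding: utf-8
import requests
from lxml import etree

response = requests.get('https://example.com')  # download the page
html = etree.HTML(response.text)                 # parse the HTML string into an element tree
print(html.xpath('//title/text()'))              # an XPath query returns a list of matching nodes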

Case study: 电影天堂 (dytt8.net)

# encoding: utf-8

import requests
from lxml import etree

BASE_DOMAIN = 'https://www.dytt8.net/'  # global constant, prepended to the relative detail-page URLs below
HEADERS = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}


# Collect every movie's detail-page URL from one list page
def get_detail_urls(url):
    # url = "https://www.dytt8.net/html/gndy/dyzz/list_23_3.html"

    response = requests.get(url, headers=HEADERS)
    # text = response.content.decode('gbk')  # decoding the list page as gbk fails on garbled characters, so use response.text instead
    text = response.text
    # print(text)
    html = etree.HTML(text)  # parse the HTML string into an element tree
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")  # //a selects every a tag under each tbspan table
    # for detail_url in detail_urls:
    #    print(BASE_DOMAIN+detail_url)
    # the for loop above can be replaced with map() and a lambda
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
    return detail_urls


# Parse one movie's detail page and return its info as a dict (called from spider() below)
def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk')  # the detail pages are gbk-encoded, so decode manually
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    # print(title)
    # thanks to /text() in the XPath above, title is already a plain string; otherwise each node would have to be serialized:
    # for x in title:
    #     print(etree.tostring(x,encoding='utf-8').decode("utf-8"))

    def parse_info(info, rule):
        # strip the field label with replace() and trim surrounding whitespace with strip()
        return info.replace(rule, "").strip()

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    img = zoomE.xpath(".//img/@src")
    # print(img)
    infos = zoomE.xpath(".//text()")  # returns a list of all text nodes inside the Zoom div
    # print(infos)

    for index, info in enumerate(infos):
        # print(info)
        if info.startswith("◎年  代"):
            # startswith("xx") checks whether the line begins with "xx"
            # print(info)
            info = parse_info(info, "◎年  代")
            movie['year'] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie['country'] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie['category'] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie['douban_score'] = info
        elif info.startswith("◎片  长"):
            info = parse_info(info, "◎片  长")
            movie['length'] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie['director'] = info
        elif info.startswith("◎主  演"):
            info = parse_info(info, "◎主  演")
            actors = [info]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            # print(actors)
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            info = parse_info(info, "◎简  介")
            profile = [info]
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎获奖情况"):
                    break
                profile.append(line)
            movie['profile'] = profile
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
    return movie


def spider():
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for x in range(3, 5):
        # crawl list pages 3 and 4; widen the range to cover more pages
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # visit the detail URL of every movie on this list page
            # print(detail_url)
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)


if __name__ == '__main__':
    spider()
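The script above only prints each movie dict. As a possible follow-up (and a preview of part three of this series), here is a hedged sketch of saving the results to a CSV file. It assumes the get_detail_urls() and parse_detail_page() functions defined above; the filename movies.csv is just an example.

import csv


def save_movies(movies, filename='movies.csv'):
    # Use the union of all keys as the CSV columns, so a detail page that is
    # missing a field does not break the export.
    fieldnames = sorted({key for movie in movies for key in movie})
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        for movie in movies:
            # actors and profile are lists; join them into a single cell
            row = {key: ' / '.join(value) if isinstance(value, list) else value
                   for key, value in movie.items()}
            writer.writerow(row)


movies = []
for x in range(3, 5):
    url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(x)
    for detail_url in get_detail_urls(url):
        movies.append(parse_detail_page(detail_url))
save_movies(movies)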

Recommended reading:

Python Study Notes (2): Web Scraping with the BeautifulSoup Library

Python Study Notes (3): Reading and Writing CSV Files
