Python · Data Collection · HTML (Web Scraping, Part 3)

Browser Plugin Tools

XPath (XML Path Language) is a language for locating information in XML and HTML documents; it lets you traverse elements and attributes in both.
XPath Helper: a browser extension for testing XPath expressions and inspecting what they match
JSON-Handle: a browser extension for viewing and navigating JSON files

JSON-Handle download link: JSON-Handle official site


XPath Syntax

Selecting nodes:

| Expression | Description | Example | Result |
|---|---|---|---|
| nodename | Selects all child nodes of the named node | bookstore | Selects all child nodes under bookstore |
| / | From the root if it leads the expression; otherwise a direct child of the current node | /bookstore | Selects the bookstore node(s) directly under the root |
| // | Selects matching nodes anywhere in the document, regardless of position | //book | Finds every book node in the document |
| @ | Selects an attribute of a node | //book[@price] | Selects all book nodes that have a price attribute |
| . | The current node | ./a | Selects a tags under the current node |
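
To see these selectors in action, here is a minimal sketch using lxml; the bookstore document below is made up to match the examples in the table:

```python
from lxml import etree

# Hypothetical document matching the table's examples
doc = etree.XML("""
<bookstore>
  <book price="10"><title>A</title></book>
  <book><title>B</title></book>
</bookstore>
""")

print(doc.xpath("/bookstore"))       # absolute path from the root
print(doc.xpath("//book"))           # book nodes anywhere in the document
print(doc.xpath("//book[@price]"))   # only books that carry a price attribute
first_book = doc.xpath("//book")[0]
print(first_book.xpath("./title"))   # title children of the current node
```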

Points to note:

  1. / vs //: / matches direct children only, while // matches all descendants. In practice // is used more often, though it depends on the situation (see the sketch after this list).

  2. contains: when an attribute holds several values, use the contains() function. Example:

    //div[contains(@class,'job_detail')]
    
  3. Indexes in predicates start at 1, not 0.
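
A quick demonstration of all three notes, run against a made-up HTML fragment:

```python
from lxml import etree

# Made-up fragment: the a tag is a grandchild of the div, not a direct child
html = etree.HTML("<div class='job_detail hot'><p><a href='/jobs/1'>job</a></p></div>")

print(html.xpath("//div/a"))     # [] -> / matches direct children only
print(html.xpath("//div//a"))    # [<a>] -> // matches any descendant
print(html.xpath("//div[contains(@class,'job_detail')]"))  # class contains 'job_detail'
print(html.xpath("//div//a[1]/text()"))  # ['job'] -> predicate indexes start at 1
```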

Predicates:

A predicate locates a specific node, or a node containing a specific value, and is written inside square brackets.
The table below lists some path expressions with predicates, together with their results:

| Path expression | Description |
|---|---|
| /bookstore/book[1] | Selects the first book child of bookstore |
| /bookstore/book[last()] | Selects the last book child of bookstore |
| /bookstore/book[position()<3] | Selects the first two book children of bookstore |
| //book[@price] | Selects book elements that have a price attribute |
| //book[@price=10] | Selects book elements whose price attribute equals 10 |
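
The same predicates, checked against a made-up three-book document:

```python
from lxml import etree

# Made-up document with three book children
doc = etree.XML("<bookstore><book>1</book><book>2</book><book>3</book></bookstore>")

print(doc.xpath("/bookstore/book[1]/text()"))             # ['1']  first child
print(doc.xpath("/bookstore/book[last()]/text()"))        # ['3']  last child
print(doc.xpath("/bookstore/book[position()<3]/text()"))  # ['1', '2']  first two
```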
With the syntax above in hand, the following example runs XPath queries through lxml against a locally saved Tencent recruitment page (tencent.html):

```python
from lxml import etree

parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('tencent.html', parser=parser)
# print(etree.tostring(html, encoding='utf-8').decode('utf-8'))

# 1. Get all tr tags
trs = html.xpath("//tr")  # xpath() always returns a list; mind the index when taking elements out
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 2. Get the 2nd tr tag (XPath indexes start at 1)
tr = html.xpath("//tr[2]")[0]
print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 3. Get all tr tags whose class equals 'even' ...
trs = html.xpath("//tr[@class='even']")
# ... or whose class attribute contains the value 'hubei'
trs = html.xpath("//tr[contains(@class,'hubei')]")
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 4. Get the href attribute of every a tag
aList = html.xpath("//a/@href")
for a in aList:
    print('http://hr.tencent.com/' + a)

# 5. Get all job postings as plain text
trs = html.xpath("//tr[position()>1]")  # skip the header row
positions = []
for tr in trs:
    href = tr.xpath(".//a/@href")[0]
    fullurl = 'http://hr.tencent.com/' + href
    title = tr.xpath(".//td[1]//text()")[0]
    category = tr.xpath(".//td[2]/text()")[0]
    number = tr.xpath(".//td[3]/text()")[0]
    city = tr.xpath(".//td[4]/text()")[0]
    pubtime = tr.xpath(".//td[5]/text()")[0]
    position = {
        'title': title,
        'url': fullurl,
        'category': category,
        'number': number,
        'city': city,
        'pubtime': pubtime,
    }
    positions.append(position)
print(positions)
```
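
Since JSON-Handle (mentioned at the top) is a tool for inspecting JSON, a natural follow-up is to dump the scraped list to a file; a minimal sketch, with an arbitrary filename:

```python
import json

# Write the positions scraped above to a file you can open with JSON-Handle
with open('positions.json', 'w', encoding='utf-8') as f:
    json.dump(positions, f, ensure_ascii=False, indent=2)
```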

Scraping dytt8.net (电影天堂)

The second example crawls the dytt8 "latest movies" listing, following each entry to its detail page:

```python
import requests
from lxml import etree

headers= {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # 'Host': 'www.dytt8.net',
    # 'Referer':'https://www.dytt8.net/',
}

BASE_DOMAIN = 'https://www.dytt8.net'

def parse_page(url):
    """
    Request a movie's detail page and extract its information.
    :param url: detail-page URL
    :return: a dict describing one movie
    """
    movie = {}  # holds the details of one movie
    res = requests.get(url, headers=headers)
    data = res.content.decode('gbk')  # dytt8 serves GBK-encoded pages
    html = etree.HTML(data)
    # absolute paths like this are fragile; they break when the layout changes
    title = html.xpath("/html/body/div[1]/div/div[3]/div[3]/div[1]/div[2]/div[1]/h1/font//text()")[0]
    movie['title'] = title
    Zoom = html.xpath("//div[@id='Zoom']")[0]
    cover = Zoom.xpath(".//img/@src")  # a list: the page may contain several images
    movie['cover'] = cover
    infos = Zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie['year'] = info.replace("◎年  代", "").strip()
        elif info.startswith("◎产  地"):
            movie['country'] = info.replace("◎产  地", "").strip()
        elif info.startswith("◎类  别"):
            movie['category'] = info.replace("◎类  别", "").strip()
        elif info.startswith("豆瓣评分"):
            movie['rating'] = info.replace("豆瓣评分", "").strip()
        elif info.startswith("◎豆瓣评分"):
            movie['rating'] = info.replace("◎豆瓣评分", "").strip()
        elif info.startswith("◎片  长"):
            movie['duration'] = info.replace("◎片  长", "").strip()
        elif info.startswith("◎导  演"):
            movie['director'] = info.replace("◎导  演", "").strip()
        elif info.startswith("◎主  演"):
            # the first actor is on this line; the rest follow one per line
            actors = [info.replace("◎主  演", "").strip()]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):  # the next ◎ field ends the actor list
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            # collect every line of the synopsis until the magnet-link line
            profile_lines = []
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("磁"):
                    break
                if line:
                    profile_lines.append(line)
            movie['profile'] = '\n'.join(profile_lines)
    download_url = Zoom.xpath('.//a/@href')[0]
    movie['download_url'] = download_url
    # print(movie)
    return movie

def get_detail_urls(url):
    """
    Get the detail-page link for every movie on one list page.
    :param url: list-page URL
    :return: a list of absolute detail-page URLs
    """
    res = requests.get(url, headers=headers)
    html = etree.HTML(res.text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASE_DOMAIN + u for u in detail_urls]

movies = []
for x in range(1, 51):  # list pages list_23_1.html .. list_23_50.html
    url = f"https://www.dytt8.net/html/gndy/dyzz/list_23_{x}.html"
    for detail_url in get_detail_urls(url):
        movie = parse_page(detail_url)
        print(movie)
        movies.append(movie)
```
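
The loop above fires requests back-to-back and only keeps results in memory. A politer variant that also persists its output is sketched below; the one-second delay and the movies.json filename are my own choices, not from the original post:

```python
import json
import time

def crawl(pages=50, delay=1.0, outfile='movies.json'):
    """Crawl all list pages with a delay between requests, saving results as JSON.

    Reuses get_detail_urls() and parse_page() defined above.
    """
    results = []
    for x in range(1, pages + 1):
        url = f"https://www.dytt8.net/html/gndy/dyzz/list_23_{x}.html"
        for detail_url in get_detail_urls(url):
            try:
                results.append(parse_page(detail_url))
            except (requests.RequestException, IndexError, UnicodeDecodeError) as e:
                print(f"skipping {detail_url}: {e}")  # download failed or layout changed
            time.sleep(delay)  # be polite to the server
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

# crawl()
```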
