day23 - Crawler 3: PyQuery and XPath


Summary

  • # PyQuery
    
    from pyquery import PyQuery
    
    # 1. Create a PyQuery object
    # A PyQuery object is essentially a container of nodes;
    # here pq's container holds just the single html root tag.
    # pq = PyQuery('html string to parse')
    
    with open(r'./temp.html', 'r', encoding='utf-8') as f:
        html = f.read()
    
    pq = PyQuery(html)
    print(type(pq))  # <class 'pyquery.pyquery.PyQuery'>
    
    # 2. Get child tags with a CSS selector
    # PyQuery object(selector)
    p_pq = pq('p')
    print(type(p_pq))  # <class 'pyquery.pyquery.PyQuery'>
    print(p_pq)
    
    a_pq = pq('a')
    print(type(a_pq))  # <class 'pyquery.pyquery.PyQuery'>
    # print(a_pq)
    
    div_pq = pq('div')
    # print(div_pq)
    print(type(div_pq[0]))  # <class 'lxml.html.HtmlElement'>
    
    li = pq('#listAll > div.layout-Module-container.layout-Cover.ListContent > ul > li')
    title = li('div > a > div.DyListCover-content > div:nth-child(1) > h3')
    
    # 3. Get the text content of a tag
    # PyQuery object.text()
    print(title.text())
    
    # 4. Get a tag attribute
    # PyQuery object.attr(attribute name)
    
    result = li.attr('class')
    print(result)       # layout-Cover-item  -  attr() only returns the attribute of the first li tag
    print('='*40)
    
    # Get the class attribute of every li tag
    for l in li:
        print(PyQuery(l).attr('class'))
    
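    # A small extra sketch (not in the original notes): PyQuery also provides .items(),
    # which yields each match already wrapped as a PyQuery object,
    # so the manual PyQuery(l) wrapping above can be avoided.
    for item in li.items():
        print(item.attr('class'))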
    
  • # lxml operations
    
    from lxml import etree
    
    # 1. Build the element tree and get its root node
    html = etree.HTML(open('temp.html', encoding='utf-8').read())
    
    # 2. Get nodes under the root node by path
    res = html.xpath('/html/body')
    print(res)      # [<Element body at 0x216b0079500>]
    
    # 3. Paths
    # Absolute path: /absolute path
    # Note: an absolute path must start from the root of the tree (for an html tree, that means starting from html)
    res = html.xpath('/html/head')
    print(res)      # [<Element head at 0x17b37a98780>]
    
    
    # Relative path: ./relative path
    # Note: a relative path starts from the current node; .   -   the current node, ..   -   the parent node
    meta = res[0].xpath('./meta')
    print(meta)
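    # A small extra sketch (not in the original notes): ".." walks back up to the parent node.
    if meta:
        parent = meta[0].xpath('..')
        print(parent)       # [<Element head at 0x...>]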
    
    # 4. Get text values with /text()
    # For example:
    # span_texts = html.xpath('//span/text()')
    
    # 5. Search from any position under a given path with //
    span = html.xpath('//span')
    print(span)
    for s in span:
        print(s.xpath('./text()'))
    
    # 6. Get attributes
    # Get an attribute   -   @attribute name
    for s in span:
        print(s.xpath('./@class'))
    
    ids = html.xpath('//@id')
    print(ids)
    
    
    # 7. Predicates   -   filtering conditions
    # 1) [N]    -   the Nth one
    result = html.xpath('/html/body//a[1]/@href')
    print(result)
    
    # 2) [last()]   -   the last one
    #    [last() - N]   -   the (N+1)th from the end
    
    # 3) [position()<3]     -   the first two, i.e. [1, 3)
    #    [position()<N]     -   the first N-1, i.e. [1, N)
    #    [position()>N]     -   everything after the Nth one, i.e. (N, ∞)
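    
    # A hedged sketch of these predicates (assumes temp.html, like the Douyu page above,
    # contains several a tags; adjust the paths for your own page):
    last_href = html.xpath('/html/body//a[last()]/@href')          # href of the last a tag
    print(last_href)
    first_two = html.xpath('/html/body//a[position()<3]/@href')    # hrefs of the first two a tags
    print(first_two)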
    
    # 8. [@attribute name]   -   has the given attribute
    result = html.xpath('//a[@class]')
    print(result)
    
    # 9. [@attribute name=value]   -   filter on an attribute value; the comparison can be =, >=, <=, > or <
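    # A hedged example (the class value 'DyListCover-content' is taken from the Douyu
    # selectors above and may not exist in your temp.html; substitute a real value):
    result = html.xpath('//div[@class="DyListCover-content"]')
    print(result)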
    
    
    # 10. *   -   wildcard; matches any tag or any attribute
    res = html.xpath('./body/*')
    print(res)
    
    attrs = html.xpath('//a/@*')
    print(attrs)
    
    # 11. |     -       or
    res = html.xpath('./body/*|./body/div/span')
    res = html.xpath('./body/*/div/text()|./body/div/a/text()')
    
    
    

Homework

# Crawling Douyu

import requests
from bs4 import BeautifulSoup
import json
import csv
import selenium
from selenium.webdriver import Chrome


def get_net_data(_url, _headers={}, _params={}):
    # Fetch a page and return its text; on a non-200 status print the response and return None.
    res = requests.get(_url, headers=_headers, params=_params)
    try:
        if res.status_code == 200:
            return res.text
        print(res)
    finally:
        res.close()


def get_json_data(_url, _headers={}, _params={}):
    # Fetch a page and return the parsed JSON; on a non-200 status print the response and return None.
    res = requests.get(_url, headers=_headers, params=_params)
    try:
        if res.status_code == 200:
            return res.json()
        print(res)
    finally:
        res.close()


def analysis_data(html: str):
    soup = BeautifulSoup(html, 'lxml')
    print(type(soup))
    lis = soup.select('#listAll > div.layout-Module-container.layout-Cover.ListContent > ul > li')
    for li in lis:
        print(type(li))
        title = li.select_one('div > a > div.DyListCover-content > div:nth-child(1) > h3')
        print('title', title.get_text())
        anchor = li.select_one('div > a > div.DyListCover-content > div:nth-child(2) > h2 > div')
        print('anchor', anchor.get_text())
        hot = li.select_one('div > a > div.DyListCover-content > div:nth-child(2) > span > svg')
        print('hot', hot)
        tag = li.select_one('div > a > div.DyListCover-content > span')
        print('tag', tag)


if __name__ == '__main__':
    # url = 'https://www.douyu.com/g_LOL'
    # all_list_data = []
    # next_url = 'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/1'
    # headers = {
    #     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    # }
    # for x in range(1, 11):
    #     next_url = f'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/{x}'
    #     py_obj = get_json_data(next_url, headers)
    #     # print(py_obj['data']['rl'])
    #     all_list_data.extend(py_obj['data']['rl'])
    #
    # with open(r'./lol.json', 'w', encoding='utf-8') as f:
    #     f.write(json.dumps(all_list_data))
    # analysis_data(html)

    with open(r'./lol.json', 'r', encoding='utf-8') as f:
        py_obj = json.loads(f.read())

    origin = 'https://www.douyu.com'
    new_list = []
    for obj in py_obj:
        one_data = [obj['rn'], obj['nn'], obj['ol'], f"{origin}{obj['url']}", obj['rs1'], obj['c2name']]
        new_list.append(one_data)
    with open(r'./lol.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['anchor', 'tag', 'hot', 'url', 'img', 'game'])
        writer.writerows(new_list)

# Crawling Douban

import requests
from lxml import etree
import re
import csv


# 'https://movie.douban.com/top250?start=0&filter='

class CrawpDoubanTop250():
    def __init__(self, headers={}, params={}):
        self.url = 'https://movie.douban.com/top250?start=%s&filter='
        self.headers = headers
        self.params = params

    def get_urls(self, page):
        return self.url % page

    def parse_url(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            return None

    def get_content(self, page):
        url = self.get_urls(page)
        content = self.parse_url(url)
        return content

    def analysis_data(self, content):
        html = etree.HTML(content)
        lis = html.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
        all_info = []
        # Looking at the page, some movies are missing some fields; xpath returns nothing for a
        # missing field rather than an empty placeholder, so the columns of the flat list would no
        # longer line up. This single combined query is therefore abandoned:
        # res = li.xpath('./div/div[2]/div[1]/a/span[1]/text()|./div/div[2]/div[1]/a/span[2]/text()|./div/div[2]/div[1]/a/span[3]/text()|./div/div[1]/a/img/@src|./div/div[2]/div[1]/a/@href|./div/div[2]/div[2]/p[1]/text()|./div/div[2]/div[2]/div/*/text()|./div/div[2]/div[2]/p[2]/span/text()')
        for li in lis:
            list_res = []
            # res = li.xpath('./div/div[2]/div[1]/a/span[1]/text()|./div/div[2]/div[1]/a/span[2]/text()')
            res = li.xpath('./div/div[2]/div[1]/a/span[position()<3]/text()')
            list_res.extend(res)
            # Some movies on the page do not have this field. If it is missing, xpath returns nothing,
            # and when the rows are written to csv as lists the columns would no longer line up,
            # so query it separately and pad the list with an empty string when it is absent.
            res2 = li.xpath('./div/div[2]/div[1]/a/span[3]/text()')
            list_res.extend(res2 if res2 else [''])
            res3 = li.xpath('./div/div[1]/a/img/@src|./div/div[2]/div[1]/a/@href|./div/div[2]/div[2]/p[1]/text()|./div/div[2]/div[2]/div/*/text()')
            list_res.extend(res3)
            # Same situation as res2
            res4 = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            list_res.extend(res4 if res4 else [''])
            new_res = []
            for item in list_res:
                item = re.sub(r'(^\s*/?)?\s+', ' ', item).strip()
                new_res.append(item)
            all_info.append(new_res)
        return all_info


if __name__ == '__main__':
    all_data = []
    top25 = CrawpDoubanTop250()
    for x in range(0, 226, 25):
        data = top25.get_content(x)
        all_data.extend(top25.analysis_data(data))

    with open(r'./top25.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'other_title', 'other', 'img', 'url', 'director', 'movie_info', 'grade', 'number', 'subhead'])
        writer.writerows(all_data)