day4 - Parsers and Using Selenium

01 - Scraping Douyu live streams


import requests
from bs4 import BeautifulSoup
import csv
import re
import json


def get_net_data():
    url = 'https://www.douyu.com/g_LOL'
    response = requests.get(url)
    if response.status_code == 200:
        return response.text

    print('Failed to fetch data!')


def analysis_data(html: str):
    # 解析方式一:直接解析json数据
    # Parsing approach 1: pull the JSON that the page embeds as window.$DATA
    result = re.findall(r'window\.\$DATA\s*=\s*(\{.*\});', html)
    # dump the raw JSON to a file so it can be inspected
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write(result[0])
    data = json.loads(result[0])
    all_data = []
    for item in data['list']:
        title = item['rn']      # room title
        anchor = item['nn']     # streamer nickname
        hot = item['ol']        # online/heat count
        tag = item.get('od', 'No description yet')   # badge, with a default
        image = item['rs1']     # cover image URL
        url = f'https://www.douyu.com/topic/lolzxz?rid={item["url"][1:]}'
        all_data.append([title, anchor, hot, tag, image, url])

    print(all_data)
    return all_data

    # Parsing approach 2: parse the page source with BeautifulSoup
    # soup = BeautifulSoup(html, 'lxml')
    # # get the li tag for each room card
    # li_list = soup.select('.layout-Cover-list>li')
    # all_data = []
    # for li in li_list:
    #     # title
    #     title = li.select_one('.DyListCover-intro').get_text()
    #     # streamer
    #     anchor = li.select_one('.DyListCover-userName').get_text()
    #     # heat
    #     hot = li.select_one('.DyListCover-hot').get_text()
    #     # badge
    #     tag = li.select_one('.HeaderCell-label-wrap.is-od')
    #     if tag:
    #         tag = tag.get_text()
    #     else:
    #         tag = 'No description yet'
    #     # print(tag)
    #     all_data.append([title, anchor, hot, tag])
    # return all_data


def save_data(data):
    with open('data.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Anchor', 'Heat', 'Badge', 'Image URL', 'Stream URL'])
        writer.writerows(data)


if __name__ == '__main__':
    data = analysis_data(get_net_data())
    save_data(data)
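
A request with no headers may be blocked or served different markup; if get_net_data() starts returning non-200, a browser-style User-Agent usually helps. A minimal sketch (the UA string below is only an example, not something from the original notes):

def get_net_data_with_headers():
    # hypothetical browser-style header; any current browser UA string will do
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    response = requests.get('https://www.douyu.com/g_LOL', headers=headers)
    if response.status_code == 200:
        return response.text
    print('Failed to fetch data!')

# usage: html = get_net_data_with_headers()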

02 - PyQuery


from pyquery import PyQuery

# 1. Create a PyQuery object
# A PyQuery object is essentially a container that holds tags
# PyQuery(html string to parse)
html = open('test.html', encoding='utf-8').read()
pq = PyQuery(html)  # the container behind pq holds just the one html tag
# print(type(pq))  # <class 'pyquery.pyquery.PyQuery'>
# print(pq)
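
PyQuery can also fetch and parse a page in one step through its url keyword; a minimal sketch, assuming network access (example.com is just a placeholder site):

# let pyquery download the document itself instead of reading a local file
doc = PyQuery(url='https://www.example.com')
print(doc('title').text())  # Example Domain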

# 2. Select child tags with a CSS selector

# PyQuery object(css selector)
# mind which values are PyQuery objects - print the type to check
ps = pq('p')
print(ps, type(ps))   # a PyQuery object: <class 'pyquery.pyquery.PyQuery'>

print(ps[0])  # <Element p at 0x24f1e0b2f98> - indexing yields an lxml Element, not a PyQuery
for p in ps:
    print(p)  # <Element p at 0x27cb8cb70e8> <Element p at 0x27cb8cb7138> <Element p at 0x27cb8cb7188>

# search the whole document for b tags (pq's container holds only the html tag)
print(pq('b'))  # <b>段落1</b><b>加粗</b>

# search within all the p tags (ps's container holds every p on the page)
print(ps('b'))  # <b>段落1</b>

# print(ps[0]('b'))  # error: ps[0] is an lxml Element, not a PyQuery object
print(PyQuery(ps[0])('b'))  # <b>段落1</b> - wrap the Element back into a PyQuery first


a_list = pq('a')
p_list = pq('p')
print(a_list)


# 3. Get tag content
# PyQuery object.text()
result = pq('h1').text()
print(result)

result = pq('a').text()
print(result, type(result))  # 这是超链接1 这是超链接2 <class 'str'> - texts of all matches joined into one string

# get each a tag's text individually
for a in pq('a'):
    print(PyQuery(a).text())
# 这是超链接1
# 这是超链接2

# 4. Get tag attributes
# PyQuery object.attr(attribute name)
result = pq('a').attr('href')
print(result)
# http://www.baidu.com  (with multiple matches, attr() returns the first one's value)

# get each a tag's href attribute
for a in pq('a'):
    print(PyQuery(a).attr('href'))
# http://www.baidu.com
# http://www.sohu.com
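
pyquery also provides an .items() method that yields each match already wrapped as a PyQuery object, which avoids the manual PyQuery(a) wrapping used above:

# .items() yields PyQuery objects, so .text()/.attr() work directly
for a in pq('a').items():
    print(a.text(), a.attr('href'))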

03 - XPath

"""
Time: 2021/5/27 上午11:52
Author: 酒坛坛儿
Good Good Study, Day Day up !
"""
from lxml import etree

# 1. Build the element tree and get the root node
html = etree.HTML(open('test.html', encoding='utf-8').read())

# 2. Get nodes (tags) by path
# node.xpath(path)  -  returns a list of all node objects matching the path
a_list = html.xpath('/html/body/div/a')
print(a_list)

# 3. Paths
# 1) Absolute path: /path from the root
# Note: an absolute path must start at the root of the tree (an html tree starts at html)
result1 = html.xpath('/html/body/h1')
result2 = a_list[0].xpath('/html/body/h1')
print(result1)   # [<Element h1 at 0x10e725600>]
print(result2)   # [<Element h1 at 0x10e725600>]

result3 = html.xpath('/body/h1')    # an absolute path can only start from html
print(result3)    # []

# 2) Relative paths
# .  - the current node (whichever node .xpath() is called on); the leading ./ may be omitted
# .. - the parent of the current node
result4 = html.xpath('./body/h1')
print(result4)    # [<Element h1 at 0x104de7700>]

div = html.xpath('/html/body/div')[0]
print(div)     # <Element div at 0x101771180>

img1 = div.xpath('/html/body/div/img')
print(img1)   # [<Element img at 0x10d5cd500>]
img2 = div.xpath('./img')
print(img2)   # [<Element img at 0x10d5cd500>]

result5 = div.xpath('../b')
print(result5)   # [<Element b at 0x108cc8480>]

# Note: appending '/text()' to a path retrieves the tag's text content
p = html.xpath('./body/p/text()')
print(p)   # ['我是段落1', '我是段落2']

# 3) // - search the whole document
# //p     - every p node anywhere in the page
# //div/p - every p that sits directly under a div, anywhere in the page
result6 = html.xpath('//p')
print(result6)    # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]

# a path starting with // searches the whole document even when called on div
result7 = div.xpath('//p')
print(result7)   # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]

result8 = html.xpath('/html/body/div//p/text()')
print(result8)

result9 = div.xpath('.//p/text()')
print(result9)

# 4. Get content and attributes
# get content      - append /text() to the path
# get an attribute - append /@attribute-name to the path
result10 = html.xpath('//a/text()')
print(result10)   # ['我是超链接1', '京东', '淘宝']

result11 = html.xpath('//a/@href')
print(result11)   # ['https://www.baidu.com', 'https://www.jd.com', 'https://www.jd.com']

print(html.xpath('//@id'))   # ['id1', 'id2', 'id3']

# 5. Predicates - filter conditions
# 1) [N] - the Nth matching tag (1-based)
result = div.xpath('./a[1]/text()')
print(result)

result = html.xpath('./body/div/p/text()')
print(result)   # ['我是段落3', '我是段落5', '我是段落11', '我是段落22']

result = html.xpath('./body/div/p[1]/text()')
print(result)   # ['我是段落3', '我是段落11']

result = html.xpath('./body/div[2]/p/text()')
print(result)  # ['我是段落11', '我是段落22']

result = html.xpath('./body/div[2]/p[1]/text()')
print(result)    # ['我是段落11']

# [last()]   - the last one
# [last()-1] - the second to last
# [last()-N] - the (N+1)th from the end
result = html.xpath('./body/div/p[last()]/text()')
print(result)   # ['我是段落5', '我是段落22']

# [position()<3] - the first two
# [position()<N] - the first N-1
result = html.xpath('./body/div[2]/p[position()<3]/text()')
print(result)   # ['我是段落11', '我是段落22']

result = html.xpath('./body/div[2]/p[position()>=3]/text()')
print(result)    # ['我是段落33', '我是段落44', '我是段落55']

# [@attr] - tags that have the given attribute
result = html.xpath('./body/div[2]/p[@class]/text()')
print(result)   # ['我是段落22', '我是段落44', '我是段落55']

# [@attr=value] - tags whose attribute equals the given value
result = html.xpath('./body/div[2]/p[@class="c1"]/text()')
print(result)   # ['我是段落22', '我是段落44']

# [child>value], [child>=value], [child<value], [child<=value], [child=value]
# filter by the content of a child tag
result = html.xpath('./body/div[last()]/li[span=150]/p/text()')
print(result)
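
Predicates can also call XPath 1.0 string functions such as contains() for substring matches on an attribute; a short sketch, assuming test.html keeps the c1-style classes used above:

# [contains(@attr, value)] - attribute substring match
result = html.xpath('./body/div[2]/p[contains(@class, "c")]/text()')
print(result)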


# 6. Wildcard - *
# * matches any node; @* matches any attribute
result = html.xpath('./body/div[1]/*')
print(result)

result = html.xpath('./body/*/*')
print(result)

result = html.xpath('//img/@*')
print(result)

# 7. | - union (branching)
result = html.xpath('./body/div[1]/p/text()|./body/div[1]/a/text()')
print(result)   # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']

div = html.xpath('./body/div[1]')[0]
print(div.xpath('./p/text()|./a/text()'))   # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']


# XPath 2.0-style grouping like (p|a) is not supported by lxml's XPath 1.0 engine,
# so both of these raise XPathEvalError:
# result = html.xpath('./body/div[1]/(p|a)/text()')
# print(result)
# result = html.xpath('./body/div[last()]/li/(span|p)/text()')
# print(result)

04 - Selenium

from selenium import webdriver

# launch a Chrome browser (needs a matching ChromeDriver available)
b = webdriver.Chrome()
b.get('http://www.jd.com')
print(b.page_source)  # the page HTML after the browser has run its JavaScript
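
Individual elements can also be located directly instead of dumping the whole source. A minimal sketch assuming Selenium 4; the '#key' selector for JD's search box is an assumption about the page, not part of the original notes:

from selenium import webdriver
from selenium.webdriver.common.by import By

b = webdriver.Chrome()
b.get('http://www.jd.com')
# Selenium 4 locates elements with a By strategy plus a selector string
search_box = b.find_element(By.CSS_SELECTOR, '#key')  # hypothetical selector
print(search_box.tag_name)
b.quit()  # always release the browser when finished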

05 - Homework: download the Top 250 images


from lxml import etree
import os
import requests

html = etree.HTML(open('top250.html', encoding='utf-8').read())

img_list = html.xpath('/html/body//img/@src')
print(img_list)

# title_list =html.xpath('/html/body//div[@class="hd"]/a/span[@class="title"]/text()')
# print(title_list)

# //span[@class="title"]


# make sure the output directory exists before writing into it
os.makedirs('img', exist_ok=True)
for index, img_url in enumerate(img_list[:-1]):   # the last src is skipped, as in the original slice
    response1 = requests.get(img_url)
    with open(f'./img/{index}.jpg', 'wb') as f:
        f.write(response1.content)
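
The commented-out title xpath above could name each poster after its movie instead of a bare index; a sketch assuming every movie's first span.title is its main title and that titles and posters line up one-to-one in top250.html:

# hypothetical variant: pair each poster with its movie title
titles = html.xpath('//div[@class="hd"]/a/span[@class="title"][1]/text()')
os.makedirs('img', exist_ok=True)
for title, src in zip(titles, img_list):
    safe_name = title.replace('/', '_')  # strip path separators from file names
    with open(f'./img/{safe_name}.jpg', 'wb') as f:
        f.write(requests.get(src).content)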