import requests
from bs4 import BeautifulSoup
import csv
import re
import json
def get_net_data():
    url = 'https://www.douyu.com/g_LOL'
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    print('Failed to fetch the page!')


def analysis_data(html: str):
    # Parsing method 1: parse the embedded JSON data directly
    result = re.findall(r'window\.\$DATA\s*=\s*(\{.*\});', html)
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write(result[0])

    data = json.loads(result[0])
    all_data = []
    for item in data['list']:
        title = item['rn']
        anchor = item['nn']
        hot = item['ol']
        tag = item.get('od', 'No description yet')
        image = item['rs1']
        url = f'https://www.douyu.com/topic/lolzxz?rid={item["url"][1:]}'
        all_data.append([title, anchor, hot, tag, image, url])
    print(all_data)
    return all_data

    # Parsing method 2: parse the rendered page source with BeautifulSoup
    # soup = BeautifulSoup(html, 'lxml')
    # # get the li tag for every live room
    # li_list = soup.select('.layout-Cover-list>li')
    # all_data = []
    # for li in li_list:
    #     # title
    #     title = li.select_one('.DyListCover-intro').get_text()
    #     # streamer
    #     anchor = li.select_one('.DyListCover-userName').get_text()
    #     # popularity
    #     hot = li.select_one('.DyListCover-hot').get_text()
    #     # verification label
    #     tag = li.select_one('.HeaderCell-label-wrap.is-od')
    #     if tag:
    #         tag = tag.get_text()
    #     else:
    #         tag = 'No description yet'
    #     # print(tag)
    #     all_data.append([title, anchor, hot, tag])
    # return all_data


def save_data(data):
    with open('data.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Streamer', 'Popularity', 'Verification', 'Image URL', 'Stream URL'])
        writer.writerows(data)


if __name__ == '__main__':
    data = analysis_data(get_net_data())
    save_data(data)
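
# An added sketch, not from the original script: douyu.com sometimes rejects the
# default requests User-Agent, in which case get_net_data() returns None. Passing a
# browser-like header is a common workaround; the exact User-Agent string below is
# only an illustrative assumption.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = requests.get('https://www.douyu.com/g_LOL', headers=headers)
print(response.status_code)
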
02-pyQuery
from pyquery import PyQuery
# 1. Create a PyQuery object
# A PyQuery object is essentially a container, and the container holds tags
# PyQuery(html string to parse)
html = open('test.html', encoding='utf-8').read()
pq = PyQuery(html)    # the container behind pq holds only the single html tag
# print(type(pq))     # <class 'pyquery.pyquery.PyQuery'>
# print(pq)

# 2. Get child tags with a CSS selector
# PyQuery object(css selector)
# Be clear about which values are PyQuery objects; print their types to check
ps = pq('p')
print(ps, type(ps))    # a PyQuery object: <class 'pyquery.pyquery.PyQuery'>
print(ps[0])           # <Element p at 0x24f1e0b2f98>
for p in ps:
    print(p)           # <Element p at 0x27cb8cb70e8> <Element p at 0x27cb8cb7138> <Element p at 0x27cb8cb7188>

# look for b tags in the whole html (the container behind pq holds only the html tag)
print(pq('b'))    # <b>段落1</b><b>加粗</b>
# look for b tags inside all the p tags (the container behind ps holds every p tag on the page)
print(ps('b'))    # <b>段落1</b>
# print(ps[0]('b'))   # error: ps[0] is an Element, not a PyQuery object
print(PyQuery(ps[0])('b'))    # <b>段落1</b>
a_list = pq('a')
p_list = pq('p')
print(a_list)

# 3. Get tag content
# PyQuery object.text()
result = pq('h1').text()
print(result)
result = pq('a').text()
print(result, type(result))    # 这是超链接1 这是超链接2 <class 'str'>

# get the content of each a tag separately
for a in pq('a'):
    print(PyQuery(a).text())
# 这是超链接1
# 这是超链接2

# 4. Get tag attributes
# PyQuery object.attr(attribute name)
result = pq('a').attr('href')
print(result)    # http://www.baidu.com

# get the href attribute of each a tag
for a in pq('a'):
    print(PyQuery(a).attr('href'))
# http://www.baidu.com
# http://www.sohu.com
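
# An added sketch, not from the original notes: PyQuery also provides .items(), which
# yields every match already wrapped as a PyQuery object, so the PyQuery(a) re-wrapping
# used above is unnecessary.
for a in pq('a').items():
    print(a.text(), a.attr('href'))
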
03-xpath
"""
Time: 2021/5/27 11:52 AM
Author: 酒坛坛儿
Good Good Study, Day Day up !
"""from lxml import etree
# 1. Build the element tree and get its root node
html = etree.HTML(open('test.html', encoding='utf-8').read())

# 2. Get nodes (tags) by path
# node.xpath(path) - returns a list of all node objects matching the path
a_list = html.xpath('/html/body/div/a')
print(a_list)

# 3. Paths
# 1) Absolute path: /absolute path
# Note: an absolute path must start from the root of the tree (an html tree starts from html)
result1 = html.xpath('/html/body/h1')
result2 = a_list[0].xpath('/html/body/h1')
print(result1)    # [<Element h1 at 0x10e725600>]
print(result2)    # [<Element h1 at 0x10e725600>]
result3 = html.xpath('/body/h1')    # an absolute path can only start from html
print(result3)    # []

# 2) Relative paths
# .  - the current node (whichever node .xpath() is called on); ./ can be omitted
# .. - the parent of the current node
result4 = html.xpath('./body/h1')
print(result4)    # [<Element h1 at 0x104de7700>]
div = html.xpath('/html/body/div')[0]
print(div)        # <Element div at 0x101771180>
img1 = div.xpath('/html/body/div/img')
print(img1)       # [<Element img at 0x10d5cd500>]
img2 = div.xpath('./img')
print(img2)       # [<Element img at 0x10d5cd500>]
result5 = div.xpath('../b')
print(result5)    # [<Element b at 0x108cc8480>]

# Note: appending '/text()' to the end of a path retrieves the tag's text content
p = html.xpath('./body/p/text()')
print(p)          # ['我是段落1', '我是段落2']

# 3) // - search the whole document
# //p     - every p tag anywhere on the page
# //div/p - every p tag that sits directly under a div, anywhere on the page
result6 = html.xpath('//p')
print(result6)    # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]
result7 = div.xpath('//p')
print(result7)    # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]
result8 = html.xpath('/html/body/div//p/text()')
print(result8)
result9 = div.xpath('.//p/text()')
print(result9)

# 4. Get content and attributes
# content   - append /text() to the path
# attribute - append /@attribute-name to the path
result10 = html.xpath('//a/text()')
print(result10)    # ['我是超链接1', '京东', '淘宝']
result11 = html.xpath('//a/@href')
print(result11)    # ['https://www.baidu.com', 'https://www.jd.com', 'https://www.jd.com']
print(html.xpath('//@id'))    # ['id1', 'id2', 'id3']

# 5. Predicates - filter conditions
# 1) [N] - the Nth matching tag
result = div.xpath('./a[1]/text()')
print(result)
result = html.xpath('./body/div/p/text()')
print(result)    # ['我是段落3', '我是段落5', '我是段落11', '我是段落22']
result = html.xpath('./body/div/p[1]/text()')
print(result)    # ['我是段落3', '我是段落11']
result = html.xpath('./body/div[2]/p/text()')
print(result)    # ['我是段落11', '我是段落22']
result = html.xpath('./body/div[2]/p[1]/text()')
print(result)    # ['我是段落11']

# [last()]   - the last one
# [last()-1] - the second to last
# [last()-N] - the (N+1)th from the end
result = html.xpath('./body/div/p[last()]/text()')
print(result)    # ['我是段落5', '我是段落22']

# [position()<3] - the first two
# [position()<N] - the first N-1
result = html.xpath('./body/div[2]/p[position()<3]/text()')
print(result)    # ['我是段落11', '我是段落22']
result = html.xpath('./body/div[2]/p[position()>=3]/text()')
print(result)    # ['我是段落33', '我是段落44', '我是段落55']

# [@attribute] - tags that have the given attribute
result = html.xpath('./body/div[2]/p[@class]/text()')
print(result)    # ['我是段落22', '我是段落44', '我是段落55']

# [@attribute=value] - tags whose attribute equals the given value
result = html.xpath('./body/div[2]/p[@class="c1"]/text()')
print(result)    # ['我是段落22', '我是段落44']

# [child>value], [child>=value], [child<value], [child<=value], [child=value]
# filter by the content of a child tag
result = html.xpath('./body/div[last()]/li[span=150]/p/text()')
print(result)

# 6. Wildcard - *
# * matches any node or any attribute
result = html.xpath('./body/div[1]/*')
print(result)
result = html.xpath('./body/*/*')
print(result)
result = html.xpath('//img/@*')
print(result)

# 7. | - union (branches)
result = html.xpath('./body/div[1]/p/text()|./body/div[1]/a/text()')
print(result)    # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']
div = html.xpath('./body/div[1]')[0]
print(div.xpath('./p/text()|./a/text()'))    # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']
# result = html.xpath('./body/div[1]/(p|a)/text()')
# print(result)
# result = html.xpath('./body/div[last()]/li/(span|p)/text()')
# print(result)
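
# An added sketch, not from the original notes: the commented-out '(p|a)' grouping is
# not valid XPath 1.0, which is what lxml implements, so those two queries raise an
# error. A single step can still match several tag names with the self:: axis:
result = html.xpath('./body/div[1]/*[self::p or self::a]/text()')
print(result)    # expected to match the union query above
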
04-selenium
from selenium import webdriver
# create a Chrome browser instance
b = webdriver.Chrome()
b.get('http://www.jd.com')
print(b.page_source)
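
# An added sketch, not from the original notes: locate an element and close the
# browser once the page source has been read. The 'key' id for JD's search box is
# an assumption about the current page and may have changed.
from selenium.webdriver.common.by import By

b.implicitly_wait(10)                        # wait up to 10 s for elements to appear
search_box = b.find_element(By.ID, 'key')    # assumed id of the search input
search_box.send_keys('python')
b.quit()                                     # release the browser when finished
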
05-Homework-top250 image download
from lxml import etree
import requests
html = etree.HTML(open('top250.html', encoding='utf-8').read())
img_list = html.xpath('/html/body//img/@src')
print(img_list)
# title_list = html.xpath('/html/body//div[@class="hd"]/a/span[@class="title"]/text()')
# print(title_list)
# //span[@class="title"]
for index in range(len(img_list[:-1])):
    response1 = requests.get(img_list[index])
    with open(f'./img/{index}.jpg', 'wb') as f:
        f.write(response1.content)
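
# An added sketch, not from the original homework: open() fails if the ./img
# directory does not exist, and a non-200 response would still be written to disk.
# A slightly more defensive version of the same loop, using only the standard
# library and requests:
import os

os.makedirs('./img', exist_ok=True)
for index, src in enumerate(img_list[:-1]):
    resp = requests.get(src)
    if resp.status_code == 200:
        with open(f'./img/{index}.jpg', 'wb') as f:
            f.write(resp.content)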