day23
总结 (Summary)
-
# PyQuery demo
from pyquery import PyQuery

# 1. Build a PyQuery object.
# A PyQuery object is essentially a container; here the container holds
# the single root <html> element parsed from the string.
# pq = PyQuery('html string to parse')
with open(r'./temp.html', 'r', encoding='utf-8') as f:
    html = f.read()
pq = PyQuery(html)
print(type(pq))  # <class 'pyquery.pyquery.PyQuery'>

# 2. Select descendant tags: pyquery_object(css_selector)
p_pq = pq('p')
print(type(p_pq))  # <class 'pyquery.pyquery.PyQuery'>
print(p_pq)
a_pq = pq('a')
print(type(a_pq))  # <class 'pyquery.pyquery.PyQuery'>
# print(a_pq)
div_pq = pq('div')
# print(div_pq)
print(type(div_pq[0]))  # <class 'lxml.html.HtmlElement'>
li = pq('#listAll > div.layout-Module-container.layout-Cover.ListContent > ul > li')
title = li('div > a > div.DyListCover-content > div:nth-child(1) > h3')

# 3. Text content of a tag: pyquery_object.text()
print(title.text())

# 4. Tag attributes: pyquery_object.attr(name)
# .attr() only reads the FIRST matched element's attribute.
result = li.attr('class')
print(result)  # layout-Cover-item  (class of the first li only)
print('='*40)
# To read the attribute of every matched li, iterate and re-wrap each
# underlying lxml element in a fresh PyQuery.
for element in li:
    print(PyQuery(element).attr('class'))
-
# lxml / XPath demo
from lxml import etree

# 1. Build the element tree and get its root node.
html = etree.HTML(open('temp.html', encoding='utf-8').read())

# 2. Select nodes by path from the root.
res = html.xpath('/html/body')
print(res)  # [<Element body at 0x216b0079500>]

# 3. Paths.
# Absolute path: /path — must start from the tree root (for HTML, /html).
res = html.xpath('/html/head')
print(res)  # [<Element head at 0x17b37a98780>]
# Relative path: ./path — starts from the current node
# (. = current node, .. = parent node).
meta = res[0].xpath('./meta')
print(meta)

# 4. Text value: /text()
# e.g. span_texts = html.xpath('//span/text()')

# 5. // — search anywhere below the given point.
span = html.xpath('//span')
print(span)
for s in span:
    print(s.xpath('./text()'))

# 6. Attributes: @name
for s in span:
    print(s.xpath('./@class'))
ids = html.xpath('//@id')
print(ids)

# 7. Predicates — filter conditions.
# 1) [N] — the N-th match
result = html.xpath('/html/body//a[1]/@href')
print(result)
# 2) [last()] — the last one; [last() - N] — the (N+1)-th from the end
# 3) [position()<3] — the first two, i.e. [1, 3);
#    [position()<N] — first N-1; [position()>N] — from N on

# 8. [@name] — elements that HAVE the attribute
result = html.xpath('//a[@class]')
print(result)

# 9. [@name=value] — attribute-value condition (=, >=, <=, >, < all work)

# 10. * — wildcard: matches any tag / any attribute
res = html.xpath('./body/*')
print(res)
attrs = html.xpath('//a/@*')
print(attrs)

# 11. | — union ("or") of two paths
res = html.xpath('./body/*|./body/div/span')
res = html.xpath('./body/*/div/text()|./body/div/a/text()')
作业 (Homework)
# Crawl Douyu (斗鱼) live-stream listings
import requests
from bs4 import BeautifulSoup
import json
import csv
import selenium
from selenium.webdriver import Chrome
def get_net_data(_url, _headers=None, _params=None):
    """GET *_url* and return the response body as text.

    On a non-200 status the response is printed for debugging and the
    function falls through to return None.

    :param _url: URL to fetch.
    :param _headers: optional request headers dict (default: none).
    :param _params: optional query-string params dict (default: none).
    :return: response text on HTTP 200, otherwise None.
    """
    # Pre-bind `res` so the finally block cannot raise NameError when
    # requests.get() itself fails (e.g. connection error) — the original
    # code would mask the real exception with an UnboundLocalError.
    res = None
    try:
        # `or {}` also guards against the old mutable-default-dict pitfall.
        res = requests.get(_url, headers=_headers or {}, params=_params or {})
        if res.status_code == 200:
            return res.text
        print(res)
    finally:
        if res is not None:
            res.close()
def get_json_data(_url, _headers=None, _params=None):
    """GET *_url* and return the response body parsed as JSON.

    On a non-200 status the response is printed for debugging and the
    function falls through to return None.

    :param _url: URL to fetch.
    :param _headers: optional request headers dict (default: none).
    :param _params: optional query-string params dict (default: none).
    :return: decoded JSON object on HTTP 200, otherwise None.
    """
    # Pre-bind `res` so the finally block cannot raise NameError when
    # requests.get() itself fails — mirrors the fix in get_net_data.
    res = None
    try:
        res = requests.get(_url, headers=_headers or {}, params=_params or {})
        if res.status_code == 200:
            return res.json()
        print(res)
    finally:
        if res is not None:
            res.close()
def analysis_data(html: str):
    """Parse a Douyu room-list page and print title/anchor/hot/tag per room."""
    soup = BeautifulSoup(html, 'lxml')
    print(type(soup))
    rooms = soup.select('#listAll > div.layout-Module-container.layout-Cover.ListContent > ul > li')
    for room in rooms:
        print(type(room))
        # Room title text.
        title = room.select_one('div > a > div.DyListCover-content > div:nth-child(1) > h3')
        print('title', title.get_text())
        # Anchor (streamer) name.
        anchor = room.select_one('div > a > div.DyListCover-content > div:nth-child(2) > h2 > div')
        print('anchor', anchor.get_text())
        # Popularity badge (the svg node itself).
        hot = room.select_one('div > a > div.DyListCover-content > div:nth-child(2) > span > svg')
        print('hot', hot)
        # Category tag element.
        tag = room.select_one('div > a > div.DyListCover-content > span')
        print('tag', tag)
if __name__ == '__main__':
    # Stage 1 (commented out after the first run): fetch 10 pages of the
    # LOL category from Douyu's mixList API and cache the raw room entries
    # in lol.json.
    # url = 'https://www.douyu.com/g_LOL'
    # all_list_data = []
    # next_url = 'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/1'
    # hearders = {
    #     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    # }
    # for x in range(1, 11):
    #     next_url = f'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/{x}'
    #     py_obj = get_json_data(next_url, hearders)
    #     all_list_data.extend(py_obj['data']['rl'])
    #
    # with open(r'./lol.json', 'w', encoding='utf-8') as f:
    #     f.write(json.dumps(all_list_data))
    # analysis_data(html)

    # Stage 2: load the cached entries and flatten the fields of interest.
    with open(r'./lol.json', 'r', encoding='utf-8') as f:
        rooms = json.loads(f.read())
    origin = 'https://www.douyu.com'
    rows = [
        [room['rn'], room['nn'], room['ol'],
         f"{origin}{room['url']}", room['rs1'], room['c2name']]
        for room in rooms
    ]
    # Stage 3: dump the flattened rows to CSV (newline='' per csv docs).
    with open(r'./lol.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['anchor', 'tag', 'hot', 'url', 'img', 'game'])
        writer.writerows(rows)
# Crawl Douban (豆瓣) Movie Top 250
import requests
from lxml import etree
import re
import csv
# 'https://movie.douban.com/top250?start=0&filter='
class CrawpDoubanTop250():
    """Crawler for the Douban Movie Top 250 listing pages."""

    # Default browser-like headers; used when the caller supplies none.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

    def __init__(self, headers=None, params=None):
        # %s is the 0-based start offset of the page (0, 25, 50, ...).
        self.url = 'https://movie.douban.com/top250?start=%s&filter='
        # Fix two defects of the original:
        # 1) mutable default arguments ({}) were shared across instances;
        # 2) headers/params were stored but then ignored by parse_url.
        # A copy of DEFAULT_HEADERS keeps the default behavior identical.
        self.headers = headers if headers else dict(self.DEFAULT_HEADERS)
        self.params = params if params is not None else {}

    def get_urls(self, page):
        """Return the listing URL for the given start offset."""
        return self.url % page

    def parse_url(self, url):
        """GET *url*; return the HTML text on HTTP 200, else None."""
        # Now honors the headers/params the instance was configured with.
        response = requests.get(url, headers=self.headers, params=self.params)
        if response.status_code == 200:
            return response.text
        else:
            return None

    def get_content(self, page):
        """Fetch and return the HTML of the page at *page* offset."""
        url = self.get_urls(page)
        content = self.parse_url(url)
        return content

    def analysis_data(self, content):
        """Extract one row of movie fields per <li> from a listing page.

        Some movies lack the third title span and/or the tagline; those
        slots are padded with '' so every row has the same column count.
        """
        html = etree.HTML(content)
        lis = html.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
        all_info = []
        for li in lis:
            list_res = []
            # Title + first alternate title: always present.
            res = li.xpath('./div/div[2]/div[1]/a/span[position()<3]/text()')
            list_res.extend(res)
            # Optional third title span — pad with '' to keep columns aligned.
            res2 = li.xpath('./div/div[2]/div[1]/a/span[3]/text()')
            list_res.extend(res2 if res2 else [''])
            # Poster, detail URL, director/cast line, rating block texts.
            res3 = li.xpath('./div/div[1]/a/img/@src|./div/div[2]/div[1]/a/@href|./div/div[2]/div[2]/p[1]/text()|./div/div[2]/div[2]/div/*/text()')
            list_res.extend(res3)
            # Optional tagline — same padding strategy as res2.
            res4 = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            list_res.extend(res4 if res4 else [''])
            # Collapse leading "/" markers and runs of whitespace.
            new_res = [re.sub(r'(^\s*/?)?\s+', ' ', item).strip()
                       for item in list_res]
            all_info.append(new_res)
        return all_info
if __name__ == '__main__':
    crawler = CrawpDoubanTop250()
    all_data = []
    # Pages are addressed by start offset: 0, 25, ..., 225 (10 pages x 25).
    for offset in range(0, 226, 25):
        page_html = crawler.get_content(offset)
        all_data.extend(crawler.analysis_data(page_html))
    # Write every extracted row to CSV (newline='' per the csv module docs).
    with open(r'./top25.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'other_title', 'other', 'img', 'url', 'director', 'movie_info', 'grade', 'number', 'subhead'])
        writer.writerows(all_data)