1 A movie ranking list
Key points:
- Use requests to fetch the page source
- Use re to pull the target fields out of the page (see the sketch right after this list)
- Use csv to write the extracted results to a CSV file
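A minimal, self-contained sketch of the named-group regex technique, run against a made-up HTML snippet rather than the live page:
import re
sample_html = ('<span class="title">The Shawshank Redemption</span>'
               '<span class="rating_num" property="v:average">9.7</span>')
# (?P<name>...) captures a named group; re.S lets '.' also match newlines
rule = re.compile(r'<span class="title">(?P<name>.*?)</span>.*?'
                  r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>', re.S)
for it in rule.finditer(sample_html):
    print(it.groupdict())  # {'name': 'The Shawshank Redemption', 'score': '9.7'}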
Code:
import requests # pip install requests
import re
import csv
url = 'https://movie.douban.com/top250'
request_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/99.0.4844.51 Safari/537.36'
}
# Fetch the page source
def get_page_content(params=None):
    resp = requests.get(url, headers=request_headers, params=params)
    return resp.text
# Run the regex match
def get_re_result(content, rule):
    return rule.finditer(content)
with open('douban_top250.csv', mode='w', encoding='utf-8', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(['name', 'year', 'score', 'number'])  # header row matching the named groups
    # Compile the rule once; re.S lets '.' also match newlines
    re_rule = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?'
                         r'<p class="">.*?<br>(?P<year>.*?) .*?'
                         r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                         r'<span>(?P<number>.*?)人评价</span>', re.S)
    for page_num in range(10):
        # Step 1: fetch the page source (25 entries per page)
        request_params = {
            'start': str(25 * page_num)
        }
        page_content = get_page_content(request_params)
        # Step 2: run the regex match
        re_result = get_re_result(page_content, re_rule)
        # Step 3: write the matches to the CSV file
        for it in re_result:
            result_dic = it.groupdict()
            result_dic['year'] = result_dic['year'].strip()
            csv_writer.writerow(result_dic.values())
        print('Page', page_num + 1, 'done')
Result:
2 An image gallery: aesthetic wallpapers
Key points:
- Use BeautifulSoup to parse the page source (see the sketch right after this list)
- Download the images found on the page
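A minimal, self-contained sketch of the find / find_all calls used below, run against a made-up HTML snippet rather than the live site:
from bs4 import BeautifulSoup
sample_html = '''
<div class="TypeList">
    <a href="/bizhitupian/1.htm"><img src="/img/1.jpg"></a>
    <a href="/bizhitupian/2.htm"><img src="/img/2.jpg"></a>
</div>
'''
soup = BeautifulSoup(sample_html, 'html.parser')
# find() returns the first matching tag, find_all() returns every match
for a in soup.find('div', attrs={'class': 'TypeList'}).find_all('a'):
    print(a.get('href'), a.find('img').get('src'))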
Code:
import requests
from bs4 import BeautifulSoup # pip install bs4
import time
import os
url = 'https://www.umeitu.com/bizhitupian/weimeibizhi/'
# Fetch the page source
resp = requests.get(url)
resp.encoding = resp.apparent_encoding  # match the page's character set
# Parse the page source with BeautifulSoup
page_content = BeautifulSoup(resp.text, 'html.parser')  # specify the HTML parser
# Get all <a> tags under <div class="TypeList">
a_list = page_content.find('div', attrs={'class': 'TypeList'}).find_all('a')
# Download the images on the page
base = 'https://www.umeitu.com'
download_path = 'umeitu/'
os.makedirs(download_path, exist_ok=True)  # make sure the download folder exists
for a in a_list:
    # Fetch the child page's source
    href = base + a.get('href')
    child_resp = requests.get(href)
    child_resp.encoding = child_resp.apparent_encoding
    # Parse the child page's source
    child_page_content = BeautifulSoup(child_resp.text, 'html.parser')
    p = child_page_content.find('p', attrs={'align': 'center'})
    img = p.find('img')
    src = img.get('src')
    # Download the image
    img_resp = requests.get(src)
    img_name = src.split('/')[-1]
    with open(download_path + img_name, mode='wb') as img_file:
        img_file.write(img_resp.content)
    print(img_name, 'over')
    # Wait 1 second so frequent requests don't get the client blocked
    time.sleep(1)
print('over')
Result:
3 A platform: search results for "saas"
Key points:
- Use etree to parse the page source
- Basic XPath syntax (see the sketch right after this list)
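A minimal, self-contained sketch of the XPath lookups used below, run against a made-up HTML snippet rather than the live site:
from lxml import etree
sample_html = '<div><a href="/shop/1"><span class="name">Some Company</span><span class="city">Hangzhou</span></a></div>'
html = etree.HTML(sample_html)  # the HTML parser wraps the fragment in <html><body>
# An absolute path from the root; text() selects the text nodes
print(html.xpath('/html/body/div/a/span[1]/text()'))  # ['Some Company']
# // matches at any depth; [0] unwraps the one-element list
print(html.xpath('//span[@class="city"]/text()')[0])  # Hangzhou
# string(...) concatenates all text under the node into a single str
print(html.xpath('string(//a)'))  # Some CompanyHangzhou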
Code:
import requests
from lxml import etree
# Fetch the page source
url = 'https://hangzhou.zbj.com/search/f/?kw=saas'
resp = requests.get(url)
# Parse the page source with etree
html = etree.HTML(resp.text)
# Print the title, price, company and location of each search result
divs = html.xpath('/html/body/div[6]/div/div/div[2]/div[5]/div[1]/div/div/div')
for div in divs:
    title = div.xpath('string(./a[2]/div[2]/div[2]/p)')           # title; string(...) already returns a str
    price = div.xpath('./a[2]/div[2]/div[1]/span[1]/text()')[0]   # price
    company = div.xpath('./a[1]/div[1]/p/text()')[1].strip('\n')  # company
    location = div.xpath('./a[1]/div[1]/div/span/text()')[0]      # location
    print(title, price, company, location)
Result:
4 A novel site: Journey to the West
Key points:
- Use coroutines and asyncio to speed up the crawler (see the sketch right after this list)
- Use aiohttp and aiofiles inside coroutines
- Use json.dumps() to serialize a dict into a JSON string
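A minimal, self-contained sketch of the asyncio + aiohttp + aiofiles pattern; the URLs are just stable placeholders:
import asyncio
import aiohttp
import aiofiles
async def fetch_and_save(url, filename):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            text = await resp.text()  # reading the body is itself awaitable
    async with aiofiles.open(filename, mode='w', encoding='utf-8') as f:
        await f.write(text)
async def main():
    # gather() runs the coroutines concurrently instead of one after another
    await asyncio.gather(
        fetch_and_save('https://example.com', 'a.html'),
        fetch_and_save('https://example.org', 'b.html'),
    )
asyncio.run(main())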
Code:
import requests
import asyncio
import aiohttp # pip install aiohttp
import aiofiles # pip install aiofiles
import json
import os
async def aio_download(cid, bid, title):
    data = {
        'book_id': bid,
        'cid': f'{bid}|{cid}',
        'need_bookinfo': 1
    }
    # Serialize data into a JSON string
    data = json.dumps(data)
    content_url = f'https://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    async with aiohttp.ClientSession() as session:
        async with session.get(content_url) as resp:
            # resp.json() is itself a coroutine, so it also needs an await
            resp_json = await resp.json()
            # Inside a coroutine, use aiofiles to write the response to a file
            async with aiofiles.open('novel/' + title, mode='w', encoding='utf-8') as f:
                await f.write(resp_json['data']['novel']['content'])
async def get_catalog(url, bid):
    resp = requests.get(url)
    resp_json = resp.json()
    # Build the list of async tasks, one per chapter
    tasks = []
    for item in resp_json['data']['novel']['items']:
        cid = item['cid']
        title = item['title']
        tasks.append(asyncio.create_task(aio_download(cid, bid, title)))
    # Run all the tasks concurrently
    await asyncio.wait(tasks)
    # All tasks have finished
    print('over')
if __name__ == '__main__':
    book_id = '4306063500'
    catalog_url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"%s"}' % book_id
    os.makedirs('novel', exist_ok=True)  # make sure the output folder exists
    # Run the top-level coroutine (asyncio.run replaces the deprecated get_event_loop pattern)
    asyncio.run(get_catalog(catalog_url, book_id))
Result:
5 A video site: overall trending list
Key points:
- Use selenium to automate a browser (see the sketch right after this list)
- Use selenium to simplify the crawler
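A minimal sketch of driving headless Chrome with selenium; it assumes a matching chromedriver is available (Selenium 4.6+ can fetch one automatically), and the URL is just a placeholder:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
options = Options()
options.add_argument('--headless')  # run without opening a window
driver = webdriver.Chrome(options=options)
driver.get('https://example.com')
# find_element + By.XPATH is the same lookup API used in the script below
print(driver.find_element(by=By.XPATH, value='//h1').text)
driver.quit()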
Code:
from selenium import webdriver # pip install selenium
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
import csv
class Bilibili(object):
    def __init__(self):
        # url
        self.url = 'https://www.bilibili.com/v/popular/all?spm_id_from=333.1007.0.0'
        # driver
        options = Options()
        options.add_argument('--headless')  # headless browser
        options.add_argument('--disable-gpu')
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # The matching browser driver must be downloaded in advance
        self.driver = webdriver.Chrome(options=options)
    def parse_data(self):
        hot_list = self.driver.find_elements(by=By.XPATH, value='//*[@id="app"]/div/div[2]/div/ul/div')
        data_list = []
        for item in hot_list:
            temp = {'title': item.find_element(by=By.XPATH, value='./div[2]/p').text,
                    'up_master': item.find_element(by=By.XPATH, value='./div[2]/div/span/span').text,
                    'play_times': item.find_element(by=By.XPATH, value='./div[2]/div/p/span[1]').text,
                    'barrage_num': item.find_element(by=By.XPATH, value='./div[2]/div/p/span[2]').text}
            data_list.append(temp)
        return data_list
    def save_data(self, data_list):
        with open('bilibili_hot_list.csv', mode='w', encoding='utf-8', newline='') as file:
            csv_writer = csv.writer(file)
            for data in data_list:
                csv_writer.writerow(data.values())
    def run(self):
        # get
        self.driver.get(self.url)
        # Scroll down step by step so lazily loaded entries get rendered
        for i in range(200):
            sleep(0.001)
            self.driver.execute_script(f'window.scrollTo(0, {100 * i})')
        print('Page source fetched')
        # parse
        data_list = self.parse_data()
        self.driver.quit()
        print('Parsing done')
        # save
        self.save_data(data_list)
        print('Saving done')
if __name__ == '__main__':
    blbl = Bilibili()
    blbl.run()
Result: