1、 简单了解爬虫
import requests
# Section 1: a minimal crawler — fetch the Xinfadi price page and print the raw HTML.
url = "http://www.xinfadi.com.cn/priceDetail.html"
head = {
    # Browser User-Agent so the site does not reject the default python-requests agent.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
}
# Response works as a context manager, so the connection is released even if
# print() raises (the original called resp.close() only on the happy path).
with requests.get(url, headers=head) as resp:
    print(resp.text)
2、 爬图片例子
import requests
from bs4 import BeautifulSoup
import time
# Section 2: image scraper — walk the wallpaper index page, visit each detail
# page, and save the image it contains into img/ (the directory must exist).
url = "https://www.umei.cc/bizhitupian/weimeibizhi/"
resp = requests.get(url)
resp.encoding = 'utf-8'  # site serves utf-8; set it so resp.text decodes correctly
main_page = BeautifulSoup(resp.text, "html.parser")
resp.close()  # original never released this connection
alist = main_page.find("div", class_="TypeList").find_all("a")
for a in alist:
    href = a.get('href')
    # Fetch the child (detail) page that holds the actual <img> tag.
    with requests.get(href) as child_page_resp:
        child_page_resp.encoding = 'utf-8'
        child_page_text = child_page_resp.text
    child_page = BeautifulSoup(child_page_text, "html.parser")
    p = child_page.find("p", align="center")
    img = p.find("img")
    src = img.get("src")
    img_name = src.split("/")[-1]  # use the last URL path segment as the file name
    # Write the binary image body; `with` already closes the file, the
    # original's extra f.close() inside the with-block was redundant.
    with requests.get(src) as img_resp, open("img/" + img_name, mode="wb") as f:
        f.write(img_resp.content)
    print("over!", img_name)
    time.sleep(1)  # be polite: throttle to roughly one request per second
3、线程池例子
import requests
from lxml import etree
import csv
from concurrent.futures import ThreadPoolExecutor
# Section 3: shared state for the thread-pool scraper below.
# newline="" is required by the csv module: without it, every row written on
# Windows is followed by a blank line (csv supplies its own \r\n terminators).
# NOTE(review): this handle is never closed explicitly; rows may be lost if
# the process dies before interpreter shutdown flushes it.
f = open("data.csv", mode="w", encoding="utf-8", newline="")
csvwriter = csv.writer(f)
head = {
    # Browser User-Agent shared by all worker threads.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
}
def download_one_page(url):
    """Fetch one Xinfadi listing page, parse its price table and append the
    rows to the shared data.csv writer.

    Relies on the module-level `head` headers and `csvwriter`; designed to be
    submitted to the ThreadPoolExecutor below.
    NOTE(review): `csvwriter` is shared by 50 threads without a lock —
    interleaved rows are possible; confirm whether that matters here.
    """
    resp = requests.get(url, headers=head)
    try:
        html = etree.HTML(resp.text)
        # Browser-copied absolute XPath; [0] is the price table element.
        table = html.xpath("/html/body/div[2]/div/div/div/div[4]/div[1]/div/table")[0]
        # position()>1 skips the header row.
        trs = table.xpath("./tr[position()>1]")
        for tr in trs:
            txt = tr.xpath("./td/text()")
            # Strip stray backslashes and slashes from the cell text.
            txt = (item.replace("\\", "").replace("/", "") for item in txt)
            csvwriter.writerow(txt)
        print(url, "提取完毕!")
    finally:
        # Original closed only on success; release the connection even when
        # the page fails to parse.
        resp.close()
if __name__ == '__main__':
    # Fan pages 1..199 out over a 50-thread pool; the executor's context
    # manager blocks until every submitted download has finished.
    page_urls = (f"http://www.xinfadi.com.cn/{i}.html" for i in range(1, 200))
    with ThreadPoolExecutor(50) as t:
        for page_url in page_urls:
            t.submit(download_one_page, page_url)
4、爬一部小说
import requests
import asyncio
import aiohttp
import aiofiles
import json
"""
步骤:
1. 同步操作:拿到所有章节
2. 异步操作: 下载所有文件内容
"""
async def aiodownload(cid, b_id, title):
    """Download one chapter's text from the Baidu novel API and write it to
    novel/<title> (the novel/ directory must already exist)."""
    # The API expects a JSON document as the `data` query parameter.
    payload = json.dumps({
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    })
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={payload}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open(f'novel/{title}', mode="w", encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])
async def getCatalog(url):
    """Fetch the book catalog synchronously, then download every chapter
    concurrently via aiodownload.

    NOTE(review): reads the module-level global `b_id` set under
    __main__ — confirm before reusing this function elsewhere.
    """
    catalog = requests.get(url).json()
    tasks = [
        asyncio.create_task(aiodownload(item['cid'], b_id, item['title']))
        for item in catalog['data']['novel']['items']
    ]
    await asyncio.wait(tasks)
if __name__ == '__main__':
    # The catalog endpoint takes the book id embedded in a JSON query parameter.
    b_id = "4306063500"
    catalog_url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    asyncio.run(getCatalog(catalog_url))
5、简单了解爬视频
"""
流程:
1. 拿到54812-1-1.html的页面源代码
2. 从源代码中提取m3u8的url
3. 下载m3u8
4. 读取m3u8文件,下载视频
5. 合并视频
注意:需要先创建 video 文件夹(可在 IDE 中将其标记为 excluded,避免索引大量 ts 文件)
"""
import requests
import re
# Section 5: read a local m3u8 playlist and download every segment into video/.
n = 1
with open("哲仁王后.m3u8", mode="r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip m3u8 directives; also skip blank lines, which the original
        # passed to requests.get("") and crashed on.
        if not line or line.startswith("#"):
            continue
        resp3 = requests.get(line)
        # The original rebound the playlist handle name `f` here; use a
        # distinct name and a with-block so the segment file is always closed.
        with open(f"video/{n}.ts", mode="wb") as seg:
            seg.write(resp3.content)
        resp3.close()
        n += 1
6、多线程版爬视频例子
"""
思路:
1. 拿到页面的源代码
2. 从iframe的页面源代码中拿到m3u8文件
3. 下载第一层m3u8文件 -> 下载第二层m3u8文件(视频的存放路径)
4. 下载视频
5. 下载密钥,进行解密
6. 合并所有ts文件作为mp4文件
"""
import requests
from bs4 import BeautifulSoup
import re
import asyncio
import aiohttp
import aiofiles
from Crypto.Cipher import AES
import os
def get_iframe_src(url):
    """Return the src attribute of the first <iframe> on the play page."""
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    iframe_src = soup.find("iframe").get("src")
    resp.close()
    return iframe_src
def download_m3u8_file(url, name):
    """Download an m3u8 playlist and save it to the file `name`.

    Bug fix: the file is opened in binary mode, so the raw bytes
    (`resp.content`) must be written — the original wrote `resp.text`
    (a str), which raises TypeError on a "wb" file.
    """
    resp = requests.get(url)
    with open(name, mode="wb") as f:
        f.write(resp.content)
    resp.close()  # original leaked the connection
def get_first_m3u8_url(url):
    """Extract the first-level m3u8 path from the `var main = "..."` JS
    assignment in the iframe page source."""
    pattern = re.compile(r'var main = "(?P<m3u8_url>.*?)"', re.S)
    resp = requests.get(url)
    m3u8_url = pattern.search(resp.text).group("m3u8_url")
    resp.close()
    return m3u8_url
async def download_ts(url, name, session):
    """Fetch one .ts segment over the shared session and save it as video/<name>."""
    async with session.get(url) as resp:
        body = await resp.content.read()
        async with aiofiles.open(f'video/{name}', mode="wb") as f:
            await f.write(body)
    print(f'{name}下载完毕')
async def aio_download(up_url):
    """Concurrently download every segment listed in the second-level playlist.

    `up_url` is the base URL that the relative segment names are appended to.
    """
    tasks = []
    async with aiohttp.ClientSession() as session:
        async with aiofiles.open("越狱第一季_second.txt", mode="r", encoding='utf-8') as f:
            async for line in f:
                line = line.strip()
                # Strip BEFORE testing: the original checked startswith("#")
                # on the raw line and let blank lines through, creating a
                # bogus request for up_url + "".
                if not line or line.startswith("#"):
                    continue
                ts_url = up_url + line
                tasks.append(asyncio.create_task(download_ts(ts_url, line, session)))
        # Must await while the session is still open, or downloads fail.
        await asyncio.wait(tasks)
def get_key(url):
    """Fetch the AES key for the stream and return it as raw bytes.

    Bug fix: AES.new (pycryptodome) requires a bytes key; the original
    returned `resp.text` (str), which raises TypeError in dec_ts.
    """
    resp = requests.get(url)
    key = resp.content
    resp.close()  # original leaked the connection
    return key
async def dec_ts(name, key):
    """Decrypt video/<name> with AES-CBC into video/temp_<name>.

    Bug fix: AES-CBC requires a 16-byte IV; the original passed
    b"00000000000" (11 bytes), which raises ValueError in pycryptodome.
    """
    aes = AES.new(key=key, IV=b"0" * 16, mode=AES.MODE_CBC)
    async with aiofiles.open(f'video/{name}', mode="rb") as f1,\
        aiofiles.open(f'video/temp_{name}', mode="wb") as f2:
        bs = await f1.read()  # segments are small enough to decrypt in one pass
        await f2.write(aes.decrypt(bs))
    print(f"{name}处理完毕")
async def aio_dec(key):
    """Concurrently decrypt every downloaded segment listed in the playlist."""
    tasks = []
    async with aiofiles.open("越狱第一季_second.txt", mode="r", encoding='utf-8') as f:
        async for line in f:
            line = line.strip()
            # Strip BEFORE testing: the original only checked "#" on the raw
            # line, so a trailing blank line spawned a bogus decrypt task.
            if not line or line.startswith("#"):
                continue
            tasks.append(asyncio.create_task(dec_ts(line, key)))
    await asyncio.wait(tasks)
def merge_ts():
    """Concatenate the decrypted segments listed in the playlist into movie.mp4.

    Bug fix: the original shelled out to `cat` via os.system, which fails on
    Windows, is subject to command-line length limits, and interpolates file
    names into a shell string. Plain Python file concatenation is portable
    and needs no shell.
    """
    with open("movie.mp4", mode="wb") as out:
        with open("越狱第一季_second.txt", mode="r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip directives and blank lines (a blank line previously
                # produced the bogus path "video/temp_").
                if not line or line.startswith("#"):
                    continue
                with open(f"video/temp_{line}", mode="rb") as seg:
                    out.write(seg.read())
    print("完毕!")
def main(url):
    """End-to-end pipeline: locate the playlists, download them, fetch the
    key, decrypt the segments and merge them into movie.mp4."""
    # 1. The play page embeds the player in an iframe.
    iframe_src = get_iframe_src(url)
    # 2. The iframe page's JS holds the first-level m3u8 path (site-relative),
    #    so prefix it with the iframe's domain.
    first_m3u8_url = get_first_m3u8_url(iframe_src)
    iframe_domain = iframe_src.split("/share")[0]
    first_m3u8_url = iframe_domain + first_m3u8_url
    # 3.1 Save the first-level playlist.
    download_m3u8_file(first_m3u8_url, "越狱第一季.txt")
    # 3.2 Its only non-directive line is the path of the second-level playlist.
    with open("越狱第一季.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            else:
                line = line.strip()
                second_m3u8_url = first_m3u8_url.split("index.m3u8")[0] + line
                download_m3u8_file(second_m3u8_url, "越狱第一季_second.txt")
    # 5. The key lives next to the second-level playlist; fetch it, then
    #    decrypt every downloaded segment and merge the results.
    second_m3u8_url_up = second_m3u8_url.replace("index.m3u8", "")
    key_url = second_m3u8_url_up + "key.key"
    key = get_key(key_url)
    asyncio.run(aio_dec(key))
    merge_ts()
if __name__ == '__main__':
    # Entry point: one episode's play page.
    main("http://91kanju2.com/vod-play/541-2-1.html")