Saving a video:
with open("a.mp4", mode="wb") as f:
    f.write(requests.get(srcurl).content)
Example:
# 1. Get the contId (from the video page URL)
# 2. Request videoStatus.jsp and pull srcUrl out of the returned JSON
# 3. Fix up the srcUrl
# 4. Download the video
# Anti-hotlinking: Referer: https://www.pearvideo.com/video_1670880
# Anti-hotlinking: traces the request's origin, i.e. which page the current request came from
import requests
url = 'https://www.pearvideo.com/video_1728717'
countID = url.split("_")[-1]  # split, not strip: take the part after the last underscore -> "1728717"
videoStatusurl = f"https://www.pearvideo.com/videoStatus.jsp?contId={countID}&mrd=0.9985367825928784"
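The mrd parameter looks like a throwaway cache-busting random float (an assumption based on its format); a small sketch that regenerates it per request instead of hard-coding one:
import random
mrd = random.random()  # assumption: any float in [0, 1) works, e.g. 0.9985367825928784
videoStatusurl = f"https://www.pearvideo.com/videoStatus.jsp?contId={countID}&mrd={mrd}"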
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
'Referer': url,
'Cookie': '__secdyid=27de06ad0a8da9a14f3b540e41c7cb467509bf8414ad2e94021620393157; JSESSIONID=871D99A140EF47CFDA8A301EFB7FA1D1; PEAR_UUID=124a7179-60ae-4f9b-b9c5-b795a6ada8bf; _uab_collina=162039315798804155909594; Hm_lvt_9707bc8d5f6bba210e7218b8496f076a=1620393158; UM_distinctid=17946f566d5a-0047bbc1afb132-d7e1739-1fa400-17946f566d671; p_h5_u=DD60EA33-AA2F-4802-B2B5-22E0F0483FBC; CNZZDATA1260553744=805876215-1620391095-https%253A%252F%252Fwww.baidu.com%252F%7C1620395494; acw_tc=781bad3616203997211672753e0c82bd49d4cbac44c3f57bc3b78bffa91b7a; Hm_lpvt_9707bc8d5f6bba210e7218b8496f076a=1620399723; SERVERID=bacac21aafa9027952fdc46518c0c74f|1620399782|1620393157'
}
resp = requests.get(videoStatusurl, headers=headers)
# resp.encoding = 'utf-8'
dic = resp.json()
# Set-Cookie: SERVERID=bacac21aafa9027952fdc46518c0c74f|1620399782|1620393157;Path=/
srcurl = dic['videoInfo']['videos']['srcUrl']
systemTime = dic['systemTime']
srcurl = srcurl.replace(systemTime, f"cont-{countID}")
# print(srcurl)
# <img class="img" src="https://image.pearvideo.com/cont/20200424/cont-1670880-12367380.png" alt="独腿画家流浪36年:生活磨难让我学会感恩,劳动维生不卑微">
# <video webkit-playsinline="" playsinline="" x-webkit-airplay="" style="width: 100%; height: 100%;" src="https://video.pearvideo.com/mp4/adshort/20210506/cont-1670880-4635-160356_adpkg-ad_hd.mp4"></video>
# Save the video
with open("a.mp4",mode= "wb" ) as f:
f.write(requests.get(srcurl).content)
# print(resp.json())
print(dic)
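For large videos, reading the whole response into memory is wasteful; a minimal streamed variant using requests' stream=True and iter_content (same srcurl as above):
with requests.get(srcurl, stream=True) as video_resp:
    with open("a.mp4", mode="wb") as f:
        for chunk in video_resp.iter_content(chunk_size=8192):
            f.write(chunk)  # write the file piece by piece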
Saving images:
# # Downloading the image
# img_a = requests.get(src)
# # img_a.content  # this gives the raw bytes
# img_name = src.split("/")[-1]  # take everything after the last "/" in the URL
# with open("img/" + img_name, mode="wb") as f:
# # with open('相册', mode="wb") as f:
#     f.write(img_a.content)  # write the image bytes to the file
Example:
from bs4 import BeautifulSoup
import requests
import time
url = 'https://www.umei.net/bizhitupian/weimeibizhi/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
# the page is declared as charset=utf-8
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'
# print(resp.text)
root = BeautifulSoup(resp.text, "html.parser")
table = root.find("div", class_="TypeList").find_all("a")  # first narrowing of the search scope
# print(table)
for a in table:
    href = a.get("href")  # .get() reads an attribute's value directly
    # print(href)
    # fetch the child page's source code
    child = requests.get(href)
    child.encoding = 'utf-8'
    child_text = child.text
    # extract the image's download URL from the child page
    child_page = BeautifulSoup(child_text, 'html.parser')
    p_img = child_page.find('p', align="center").find('img')
    src = p_img.get("src")
    print(src)
    # print(p_img.get("src"))
    # Downloading the image
    img_a = requests.get(src)
    # img_a.content  # this gives the raw bytes
    img_name = src.split("/")[-1]  # take everything after the last "/" in the URL
    with open("img/" + img_name, mode="wb") as f:  # note: the img/ directory must already exist
        # with open('相册', mode="wb") as f:
        f.write(img_a.content)  # write the image bytes to the file
    print('over!!!')
    time.sleep(1)
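A caveat for this kind of crawl: if the site returns relative links in href, they must be joined against the page URL before requesting; a small sketch using the standard library's urljoin (the relative path below is hypothetical):
from urllib.parse import urljoin
base = 'https://www.umei.net/bizhitupian/weimeibizhi/'
full = urljoin(base, "/e/65eb3910.htm")  # hypothetical relative href
# -> https://www.umei.net/e/65eb3910.htm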
Saving data:
f = open("数据.csv", mode='w', encoding='utf-8', newline='')  # newline='' avoids blank rows on Windows
csvwriter = csv.writer(f)
Then, inside your loop, add:
csvwriter.writerow([span, p, com_name, location])
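Equivalently, a with block closes the file automatically once all rows are written; a minimal sketch assuming `rows` is a hypothetical iterable of the scraped records:
with open("数据.csv", mode='w', encoding='utf-8', newline='') as f:
    csvwriter = csv.writer(f)
    for span, p, com_name, location in rows:  # `rows` is hypothetical here
        csvwriter.writerow([span, p, com_name, location])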
Example: saving to CSV
import requests
from lxml import etree
import csv
from concurrent.futures import ThreadPoolExecutor
# How we save:
f = open("4_5新价.csv", mode="w", encoding='utf-8', newline='')
csvwriter = csv.writer(f)
def download_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                      '/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    # print(resp.text)
    html = etree.HTML(resp.text)
    # tbody = html.xpath('/html/body/div[2]/div[4]/div[1]/table/')[0]  # trailing "/" is invalid XPath
    tbody = html.xpath('/html/body/div[2]/div[4]/div[1]/table')[0]
    # print(tbody)
    trs = tbody.xpath('./tr[position()>1]')  # skip the header row
    for tr in trs:
        tds = tr.xpath('./td/text()')
        # light per-cell cleanup with a generator expression
        tds = (item.replace("\\", "").replace("/", "") for item in tds)
        csvwriter.writerow(tds)
if __name__ == '__main__':
    # for i in range(1, 314940):  # one page at a time: extremely inefficient
    #     download_one_page(f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml")
    with ThreadPoolExecutor(50) as t:
        for i in range(1, 200):
            # hand the task off to the thread pool
            t.submit(download_one_page, f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml")
    print('All pages extracted!')
    f.close()  # flush and close the CSV file once every task is done
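One caveat: csvwriter is shared by all 50 worker threads, and interleaved writerow calls can mangle rows; a minimal sketch serializing the writes with a lock (the lock and helper names are illustrative):
from threading import Lock

csv_lock = Lock()

def write_row_safely(row):
    # hypothetical helper: only one thread writes at a time
    with csv_lock:
        csvwriter.writerow(row)
Inside download_one_page, call write_row_safely(list(tds)) instead of csvwriter.writerow(tds).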