Crawler Login Summary

Saving a video:

with open("a.mp4",mode= "wb" ) as  f:
    f.write(requests.get(srcurl).content)
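For a large video, .content loads the whole file into memory before writing it out. A minimal streaming sketch, assuming srcurl already points at the final .mp4 as in the example below, downloads it in chunks instead:

import requests

# Stream the response and write it chunk by chunk instead of buffering
# the whole video in memory. srcurl is assumed to be the direct mp4 URL.
with requests.get(srcurl, stream=True) as resp:
    resp.raise_for_status()
    with open("a.mp4", mode="wb") as f:
        for chunk in resp.iter_content(chunk_size=1024 * 64):
            f.write(chunk)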

Example:

# 1. Get the contId from the page URL
# 2. Get the JSON returned by videoStatus.jsp --> srcUrl
# 3. Fix up the contents of that srcUrl
# 4. Download the video


# Anti-hotlinking: Referer: https://www.pearvideo.com/video_1670880
# Anti-hotlinking: tracing the source -- who is the parent page of the current request


import requests


url = 'https://www.pearvideo.com/video_1728717'
countID = url.split("_")[-1]  # take the part after the last "_" (the contId)

videoStatusurl = f"https://www.pearvideo.com/videoStatus.jsp?contId={countID}&mrd=0.9985367825928784"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Referer': url,
    'Cookie': '__secdyid=27de06ad0a8da9a14f3b540e41c7cb467509bf8414ad2e94021620393157; JSESSIONID=871D99A140EF47CFDA8A301EFB7FA1D1; PEAR_UUID=124a7179-60ae-4f9b-b9c5-b795a6ada8bf; _uab_collina=162039315798804155909594; Hm_lvt_9707bc8d5f6bba210e7218b8496f076a=1620393158; UM_distinctid=17946f566d5a-0047bbc1afb132-d7e1739-1fa400-17946f566d671; p_h5_u=DD60EA33-AA2F-4802-B2B5-22E0F0483FBC; CNZZDATA1260553744=805876215-1620391095-https%253A%252F%252Fwww.baidu.com%252F%7C1620395494; acw_tc=781bad3616203997211672753e0c82bd49d4cbac44c3f57bc3b78bffa91b7a; Hm_lpvt_9707bc8d5f6bba210e7218b8496f076a=1620399723; SERVERID=bacac21aafa9027952fdc46518c0c74f|1620399782|1620393157'
}

resp = requests.get(videoStatusurl, headers=headers)
# resp.encoding = 'utf-8'
dic = resp.json()
# Set-Cookie: SERVERID=bacac21aafa9027952fdc46518c0c74f|1620399782|1620393157;Path=/
#
srcurl = dic['videoInfo']['videos']['srcUrl']
systemTime = dic['systemTime']
srcurl = srcurl.replace(systemTime, f"cont-{countID}")
# print(srcurl)
# <img class="img" src="https://image.pearvideo.com/cont/20200424/cont-1670880-12367380.png" alt="独腿画家流浪36年:生活磨难让我学会感恩,劳动维生不卑微">
# <video webkit-playsinline="" playsinline="" x-webkit-airplay="" style="width: 100%; height: 100%;" src="https://video.pearvideo.com/mp4/adshort/20210506/cont-1670880-4635-160356_adpkg-ad_hd.mp4"></video>
# Save the video
#
with open("a.mp4", mode="wb") as f:
    f.write(requests.get(srcurl).content)

# print(resp.json())
print(dic)
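The fiddly part above is step 3: the srcUrl returned by videoStatus.jsp embeds systemTime in the file name, while the real file sits at the same path with cont-{contId} in its place. A small worked sketch with made-up values (in the script above they come from the JSON response):

# Illustrative values only; in the script above they come from dic.
systemTime = "1620399999999"
countID = "1728717"
fake_url = f"https://video.pearvideo.com/mp4/adshort/20210506/{systemTime}-4635-160356_adpkg-ad_hd.mp4"

real_url = fake_url.replace(systemTime, f"cont-{countID}")
print(real_url)
# https://video.pearvideo.com/mp4/adshort/20210506/cont-1728717-4635-160356_adpkg-ad_hd.mp4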

Saving images:

#     # Download the image
#     img_a = requests.get(src)
#     # img_a.content    # this is the raw bytes
#     img_name = src.split("/")[-1]  # take everything after the last "/" in the URL
#     with open("img/" + img_name, mode="wb") as f:
#     # with open('相册', mode="wb") as f:
#         f.write(img_a.content)  # write the image bytes to the file
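The snippet writes into an img/ folder, and open() raises FileNotFoundError if that folder does not exist yet. A one-line sketch, using the same relative folder name, to create it up front:

import os

# Make sure the img/ folder used above exists before writing files into it.
os.makedirs("img", exist_ok=True)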

Example:

from bs4 import BeautifulSoup
import requests
import time
import csv

url = 'https://www.umei.net/bizhitupian/weimeibizhi/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
# charset=utf-8"

resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'
# print(resp.text)
root = BeautifulSoup(resp.text, "html.parser")
table = root.find("div", class_="TypeList").find_all("a")  # first narrowing of the search scope
# print(table)
for a in table:
    href = a.get("href")  # .get() fetches an attribute value directly
    # print(href)
    # Get the child page's source code
    child = requests.get(href)
    child.encoding = 'utf-8'
    child_text = child.text
    # Get the image download URL from the child page
    child_page = BeautifulSoup(child_text, 'html.parser')
    p_img = child_page.find('p', align="center").find('img')
    src = p_img.get("src")
    print(src)
#     # print(p_img.get("src"))
#
    # Download the image
    img_a = requests.get(src)
    # img_a.content    # this is the raw bytes
    img_name = src.split("/")[-1]  # take everything after the last "/" in the URL
    with open("img/" + img_name, mode="wb") as f:
    # with open('相册', mode="wb") as f:
        f.write(img_a.content)  # write the image bytes to the file

    print('over!!!')
    time.sleep(1)

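One caveat about the example above: requests.get(href) only works if the href attributes on the list page are absolute URLs. If the site ever returns relative paths, they can be resolved against the listing page first; a hedged sketch using urllib.parse.urljoin (the sample path is made up for illustration):

from urllib.parse import urljoin

# Resolve a possibly-relative href against the page it was found on.
# urljoin leaves absolute URLs untouched, so it is safe either way.
def absolute_href(page_url, href):
    return urljoin(page_url, href)

print(absolute_href("https://www.umei.net/bizhitupian/weimeibizhi/", "/e/221.htm"))
# https://www.umei.net/e/221.htm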

 

Saving data:

f = open("数据.csv" , mode='w' , encoding= 'utf-8 ')
csvwriter = csv.writer(f)

Then, inside your loop, add:

csvwriter.writerow([span,p,com_name,location])
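Putting the two pieces together, a minimal self-contained sketch; the row values are placeholders for whatever your loop scraped. Passing newline="" to open() is how the csv module avoids blank lines between rows on Windows:

import csv

# Placeholder rows standing in for the values collected in the scraping loop.
rows = [["span1", "p1", "company A", "Beijing"],
        ["span2", "p2", "company B", "Shanghai"]]

with open("数据.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    for span, p, com_name, location in rows:
        csvwriter.writerow([span, p, com_name, location])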

Example: saving to CSV

import requests
from lxml import etree
import csv
from concurrent.futures import ThreadPoolExecutor


# How the data is saved:

f = open("4_5新价.csv", mode="w", encoding='utf-8')
csvwriter = csv.writer(f)


def download_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                      '/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    # print(resp.text)
    html = etree.HTML(resp.text)
    # tbody = html.xpath('/html/body/div[2]/div[4]/div[1]/table/')[0]
    tbody = html.xpath('/html/body/div[2]/div[4]/div[1]/table')[0]

    # print(tbody)
    tr = tbody.xpath('./tr[position()>1]')  # skip the header row
    for td in tr:
        tds = td.xpath('./td/text()')
        # Simple cleanup of the data, written as a generator expression
        # print(td)
        tds = (item.replace("\\", "").replace("/", "") for item in tds)
        csvwriter.writerow(tds)



if __name__ == '__main__':
    # for i in range(1, 314940):  # fetching the pages one by one is extremely slow
    #     download_one_page(f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml")

    with ThreadPoolExecutor(50) as t:
        for i in range(1, 200):
            # hand the task off to the thread pool
            t.submit(download_one_page, f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml")

    f.close()
    print('Extraction finished!')
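One thing to watch in this example: all 50 worker threads share a single csvwriter, and the csv module makes no thread-safety promise, so rows from different pages could in principle interleave. A hedged sketch that guards the shared writer with a threading.Lock, meant as a drop-in for the csvwriter.writerow(tds) call above:

import csv
import threading

# Only one thread at a time may write to the shared csv file.
write_lock = threading.Lock()
f = open("4_5新价.csv", mode="w", encoding="utf-8", newline="")
csvwriter = csv.writer(f)

def safe_writerow(row):
    with write_lock:
        csvwriter.writerow(row)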

 
