Python 简单的网页爬虫与测试笔记
流程说明（脚本文件：boost.py）
import time
import requests
import random
from lxml import etree
import json
def getonepage(n):
    """Fetch one Zhihu column article page and return its raw HTML text.

    n: numeric article id interpolated into the URL path.
    """
    # URL must start with http(s); f-string fills the {n} placeholder.
    url = f'https://zhuanlan.zhihu.com/p/{n}?utm_source=qq&utm_medium=social'
    # Browser-like request header (copied from DevTools -> Network -> request headers).
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
    # timeout added: without it a stalled connection would hang the crawl loop forever.
    r = requests.get(url, headers=header, timeout=10)
    return r.text
def parse(i, text):
    """Extract the article title from raw HTML and persist it via save2file.

    i: article id, echoed next to the title for traceability.
    text: raw HTML string, as returned by getonepage().
    """
    # Normalize/standardize the markup into an element tree queryable with XPath.
    html = etree.HTML(text)
    # A Zhihu column post title lives in <h1 class="Post-Title">.
    names = html.xpath('//h1[@class="Post-Title"]/text()')
    # Guard clause (was `if names==[]: pass / else:`): skip pages without a
    # title, e.g. deleted or never-existing post ids.
    if not names:
        return
    print(str(i))
    print(names)
    save2file('' + str(i))
    save2file(names)
#文件保存
def save2file(data):
    """Append *data* as one JSON-encoded line to chengxu/<j>.txt.

    The target filename comes from the module-level counter ``j``.
    """
    global j
    target = 'chengxu//' + str(j) + '.txt'
    # json.dumps converts dicts/lists to a string (only strings can be written
    # to a text file); ensure_ascii=False keeps CJK characters human-readable.
    record = json.dumps(data, ensure_ascii=False)
    with open(target, 'a', encoding='utf-8') as f:
        f.write(record + '\n')
# --- Driver: resume crawling from the counter stored in chengxu/test.txt ---
# The counter file holds the next article id; each run claims a batch of
# 1000 ids by rewriting the counter before it starts crawling.
# (Removed: `global j`/`global jk` statements — they are no-ops at module level.)
with open('chengxu//test.txt', 'r', encoding='utf-8') as f1:
    source = f1.read()
print(source)
j = source  # module-level: save2file() uses j to name its output file
print(j)
print(int(j) + 1)
jk = int(j)           # first article id of this run
abc = int(j) + 1000   # starting id for the next run

# Advance the counter on disk immediately, so a crash mid-batch does not
# cause the same ids to be crawled again.
with open('chengxu//test.txt', 'w', encoding='utf-8') as f2:
    f2.write(str(abc))

for i in range(jk, jk + 1000):
    # Progress marker roughly once every 10 ids.
    if (i % 10) == 1:
        print(i)
    text = getonepage(i)
    items = parse(i, text)  # parse() returns None; kept for readability
    # Random 0-0.9 s pause between requests to avoid hammering the server.
    cease = random.randint(0, 9)
    time.sleep(0.1 * cease)

# Park the process after finishing the batch (original 10000 s value kept).
time.sleep(10000)
#10251300-10267550
#40251300-40259900
#40269900-40279900
#40279900-40289900
#50000000-50001000
#50001000,50002000
XPath 语法备忘:
#//url        选取文档中所有名为 url 的元素（任意层级）
#//url/url    在上述元素下再选取名为 url 的子元素
#在 XPath 中匹配到的是 Unicode 编码的不间断空格符（\xa0），
#所以将普通空格全部替换为 '\xa0' 即可匹配。
在线测试网站（在线运行环境缺少第三方模块，一般只能运行简单程序）
https://c.runoob.com/compile/9
Python 2.7: https://www.tutorialspoint.com/execute_python_online.php
视频文件处理（HLS .ts 分段下载与合并）
# NOTE(review): the triple-quoted block below is disabled code kept as a dead
# module-level string literal. It is old reference code that read an HLS
# playlist (index.txt), cut out the segment names after '#EXTINF:', and
# appended each downloaded .ts segment into one output file. Left byte-for-byte
# intact because it is a runtime string, not a comment.
'''
import requests
#https://blog.csdn.net/msspark/article/details/86745391 文件的读写操作
file = open("index.txt","r")#打开文件
zifu = file.read()#读取所有文件内容
print(zifu)
file.close()#关闭文件
fengede=zifu.split('#EXTINF:')
for i in range(2,len(fengede)):
fengede[i]=fengede[i].strip()
fengede[i]=fengede[i][10:26]
#fengede=fengede.
print(fengede)
#print(fengede[2][10:26])
for i in range(1,len(fengede)):
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
#pd_url = 'https://ali-video.acfun.cn/mediacloud/acfun/acfun_video/segment/'+fengede[i]
pd_url = 'https://bobo.okokbo.com/20180303/zSx23T4P/800kb/hls/'+fengede[i]
res = requests.get(pd_url,headers=header)
#with open('download\hybk'+str(i)+'.ts', 'ab+') as f:
with open('bgyx.mp4', 'ab+') as f:
f.write(res.content)
f.flush()
print(i ,len(fengede))
'''
import requests

# --- Download sequentially numbered HLS .ts segments into one file ---
# Segment ids run from ...00010.ts to ...00098.ts; each response body is
# appended to mkxdcp2.ts.

# Browser-like User-Agent, built once — the original rebuilt the identical
# dict on every loop iteration.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}

for i in range(10, 99):
    seg_url = ('https://youku.com-t-youku.com/20190318/9338_484dd451/1000k/hls/'
               '0fd10adc6ba0000' + str(i) + '.ts')
    # timeout added so one dead segment server cannot hang the whole download.
    res = requests.get(seg_url, headers=header, timeout=30)
    # 'ab+' appends each new segment to the growing output file.
    with open('mkxdcp2.ts', 'ab+') as f:
        f.write(res.content)
        f.flush()
    print(i)