Python 简单的网页爬虫与测试笔记
流程说明（脚本文件：boost.py）
import time
import requests
import random
from lxml import etree
import json
def getonepage(n):
    """Fetch one Zhihu column article page and return its raw HTML text.

    n: numeric article id interpolated into the URL path.
    """
    # URL must start with http(s); f-string fills the {n} placeholder.
    url = f'https://zhuanlan.zhihu.com/p/{n}?utm_source=qq&utm_medium=social'
    # Browser-like request header (copied from DevTools -> Network -> request headers).
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
    # timeout added: without it a stalled connection would hang the crawl loop forever.
    r = requests.get(url, headers=header, timeout=10)
    return r.text
def parse(i, text):
    """Extract the article title from raw HTML and persist it via save2file.

    i: article id, echoed next to the title for traceability.
    text: raw HTML string, as returned by getonepage().
    """
    # Normalize/standardize the markup into an element tree queryable with XPath.
    html = etree.HTML(text)
    # A Zhihu column post title lives in <h1 class="Post-Title">.
    names = html.xpath('//h1[@class="Post-Title"]/text()')
    # Guard clause (was `if names==[]: pass / else:`): skip pages without a
    # title, e.g. deleted or never-existing post ids.
    if not names:
        return
    print(str(i))
    print(names)
    save2file('' + str(i))
    save2file(names)
#文件保存
def save2file(data):
    """Append *data* as one JSON-encoded line to chengxu/<j>.txt.

    The target filename comes from the module-level counter ``j``.
    """
    global j
    target = 'chengxu//' + str(j) + '.txt'
    # json.dumps converts dicts/lists to a string (only strings can be written
    # to a text file); ensure_ascii=False keeps CJK characters human-readable.
    record = json.dumps(data, ensure_ascii=False)
    with open(target, 'a', encoding='utf-8') as f:
        f.write(record + '\n')
# --- Driver: resume crawling from the counter stored in chengxu/test.txt ---
# The counter file holds the next article id; each run claims a batch of
# 1000 ids by rewriting the counter before it starts crawling.
# (Removed: `global j`/`global jk` statements — they are no-ops at module level.)
with open('chengxu//test.txt', 'r', encoding='utf-8') as f1:
    source = f1.read()
print(source)
j = source  # module-level: save2file() uses j to name its output file
print(j)
print(int(j) + 1)
jk = int(j)           # first article id of this run
abc = int(j) + 1000   # starting id for the next run

# Advance the counter on disk immediately, so a crash mid-batch does not
# cause the same ids to be crawled again.
with open('chengxu//test.txt', 'w', encoding='utf-8') as f2:
    f2.write(str(abc))

for i in range(jk, jk + 1000):
    # Progress marker roughly once every 10 ids.
    if (i % 10) == 1:
        print(i)
    text = getonepage(i)
    items = parse(i, text)  # parse() returns None; kept for readability
    # Random 0-0.9 s pause between requests to avoid hammering the server.
    cease = random.randint(0, 9)
    time.sleep(0.1 * cease)

# Park the process after finishing the batch (original 10000 s value kept).
time.sleep(10000)
#10251300-10267550
#40251300-40259900
#40269900-40279900
#40279900-40289900
#50000000-50001000
#50001000,50002000
XPath 语法备忘:
#//url        选取文档中所有名为 url 的元素（任意层级）
#//url/url    在上述元素下再选取名为 url 的子元素
#在 XPath 中匹配到的是 Unicode 编码的不间断空格符（\xa0），
#所以将普通空格全部替换为 '\xa0' 即可匹配。
在线测试网站（在线运行环境缺少第三方模块，一般只能运行简单程序）
https://c.runoob.com/compile/9
Python 2.7: https://www.tutorialspoint.com/execute_python_online.php
视频文件处理（HLS .ts 分段下载与合并）
# NOTE(review): the triple-quoted block below is disabled code kept as a dead
# module-level string literal. It is old reference code that read an HLS
# playlist (index.txt), cut out the segment names after '#EXTINF:', and
# appended each downloaded .ts segment into one output file. Left byte-for-byte
# intact because it is a runtime string, not a comment.
'''
import requests
#https://blog.csdn.net/msspark/article/details/86745391 文件的读写操作
file = open("index.txt","r")#打开文件
zifu = file.read()#读取所有文件内容
print(zifu)
file.close()#关闭文件
fengede=zifu.split('#EXTINF:')
for i in range(2,len(fengede)):
fengede[i]=fengede[i].strip()
fengede[i]=fengede[i][10:26]
#fengede=fengede.
print(fengede)
#print(fengede[2][10:26])
for i in range(1,len(fengede)):
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
#pd_url = 'https://ali-video.acfun.cn/mediacloud/acfun/acfun_video/segment/'+fengede[i]
pd_url = 'https://bobo.okokbo.com/20180303/zSx23T4P/800kb/hls/'+fengede[i]
res = requests.get(pd_url,headers=header)
#with open('download\hybk'+str(i)+'.ts', 'ab+') as f:
with open('bgyx.mp4', 'ab+') as f:
f.write(res.content)
f.flush()
print(i ,len(fengede))
'''
import requests

# --- Download sequentially numbered HLS .ts segments into one file ---
# Segment ids run from ...00010.ts to ...00098.ts; each response body is
# appended to mkxdcp2.ts.

# Browser-like User-Agent, built once — the original rebuilt the identical
# dict on every loop iteration.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}

for i in range(10, 99):
    seg_url = ('https://youku.com-t-youku.com/20190318/9338_484dd451/1000k/hls/'
               '0fd10adc6ba0000' + str(i) + '.ts')
    # timeout added so one dead segment server cannot hang the whole download.
    res = requests.get(seg_url, headers=header, timeout=30)
    # 'ab+' appends each new segment to the growing output file.
    with open('mkxdcp2.ts', 'ab+') as f:
        f.write(res.content)
        f.flush()
    print(i)