一、爬取新闻联播视频
1. 将视频拖到最后
2. 分析视频缓存链接
如图所示url:https://hls.cntv.baishancdnx.cn/asp/hls/1200/0303000a/3/default/46c6c76d679340d5bb1df3a87573c952/270.ts
3. 将url对应的二进制内容抓取下来
代码如下:
import requests
import os
if __name__ == "__main__":
    # Download the sequentially numbered video segments (1..270) from the CDN.
    # NOTE(review): the captured cache URL in the prose above ends in ".ts",
    # but this code requests ".mp4" — confirm which extension the CDN actually serves.
    for i in range(270):
        url = ("https://hls.cntv.baishancdnx.cn/asp/hls/1200/0303000a/3/default/"
               "46c6c76d679340d5bb1df3a87573c952/" + str(i + 1) + ".mp4")
        root = "videos/0926/"
        path = root + str(i + 1) + ".mp4"  # local file name for this segment
        try:
            # makedirs (not mkdir): "videos/0926/" is nested, so plain mkdir
            # fails when "videos/" itself does not exist yet.
            os.makedirs(root, exist_ok=True)
            if not os.path.exists(path):
                r = requests.get(url)   # fetch the segment's binary content
                r.raise_for_status()    # surface HTTP errors instead of saving an error page
                # Binary-write mode; the with-block closes the file, no close() needed.
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("文件" + path + " 保存成功!")
            else:
                print("文件已存在")
        except Exception as e:
            # Report which segment failed and why, instead of a bare except
            # that silently swallows every error (including KeyboardInterrupt).
            print("爬取失败", e)
二、爬取新闻文稿
代码如下:
import requests
from urllib.request import urlopen#用于获取网页
from bs4 import BeautifulSoup
def getUrls():
    """Collect article links from the news list page.

    Returns:
        list: the href of every <a> element inside the page's
        <aside class="news_list"> container. (Hrefs may be relative —
        TODO confirm the caller can fetch them as-is.)
    """
    list_url = "http://www.sdpp.com.cn/list/list_98.html"
    html = urlopen(list_url)  # fetch the listing page
    bs = BeautifulSoup(html, "html.parser")
    anchors = bs.find('aside', class_="news_list").find_all("a")
    # Comprehension replaces the original manual append loop; also avoids
    # shadowing the page URL variable with the loop item.
    url_list = [a.get('href') for a in anchors]
    print(url_list)  # progress/debug output kept from the original
    return url_list
if __name__ == "__main__":
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    }
    # The original wrote into "texts/" without creating it; open() would
    # raise FileNotFoundError on a fresh checkout.
    os.makedirs("texts", exist_ok=True)
    url_list = getUrls()
    for url in url_list:
        # Output name derived from the article id embedded in the URL
        # (assumes ".../<id>.html" with a 4-char id — TODO confirm).
        filename = "texts/newsText" + url[-9:-5] + ".txt"
        r = requests.get(url, headers=headers)
        # Fix: the original set 'urf8' (typo); an unknown codec makes
        # r.text raise LookupError. 'utf-8' decodes the page correctly.
        r.encoding = 'utf-8'
        bs = BeautifulSoup(r.text, "html.parser")
        allpagecount = int(bs.find("span", {"id": "allpagecount"}).get_text())
        title = bs.find("div", {"class": "keys3"}).get_text()
        temp = title + "\n"
        # Fix: collect the first page's body text too — the original only
        # followed "next page" links, so page 1's content was never saved.
        maintext = bs.find("div", {"class": "textCon"}).get_text()
        temp = temp + maintext + "\n"
        print(maintext)
        # Follow the "next page" link for the remaining allpagecount-1 pages.
        for _ in range(1, allpagecount):
            link = bs.find("a", {"id": "nextpageurl"})["href"]
            r = requests.get(link, headers=headers)
            r.encoding = 'utf-8'
            bs = BeautifulSoup(r.text, "html.parser")
            maintext = bs.find("div", {"class": "textCon"}).get_text()
            temp = temp + maintext + "\n"
            print(maintext)
        # One output file per article, written after all pages are collected.
        with open(filename, "w", encoding="utf8") as f:
            f.write(temp)