import json
import os

import requests
from bs4 import BeautifulSoup
from lxml import etree
class QiuShi(object):
    """Crawler for qiushibaike.com.

    Walks the recommendation list pages, follows each detail link, and
    persists every post as one JSON line plus its images/video on disk.
    """

    def __init__(self):
        # Browser-like User-Agent so the site does not reject the requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        # List-page URL template; "{}" is filled with the page number.
        self.base_url = "https://www.qiushibaike.com/8hr/page/{}"

    def get_html_text(self, url):
        """Fetch *url* and return the response body as text.

        Returns None on a non-200 status or on any network error, so
        callers can skip a bad page instead of crashing the whole crawl.
        """
        try:
            # Timeout added: without it a stalled connection hangs forever.
            response = requests.get(url, headers=self.headers, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def parse_list_page(self, text):
        """Parse a list page and return the absolute detail-page URLs."""
        if not text:
            # Guard: etree.HTML(None) would raise; a failed fetch yields no URLs.
            return []
        html = etree.HTML(text)
        hrefs = html.xpath("//a[@class='recmd-content']/@href")
        # The hrefs are site-relative, so prepend scheme + host.
        return ["https://www.qiushibaike.com" + h for h in hrefs]

    def parse_detail_page(self, text):
        """Parse one detail page into a dict.

        Keys: author, content, video_url, img_urls, like_num.  Any piece
        missing from the page becomes None instead of raising
        AttributeError (the original crashed on pages lacking a node).
        """
        soup = BeautifulSoup(text, "lxml")

        author_tag = soup.find("span", attrs={"class": "side-user-name"})
        author = author_tag.string if author_tag else None

        content_tag = soup.find("div", attrs={"class": "content"})
        content = content_tag.string if content_tag else None

        video_url = None
        video = soup.find("video")
        if video:
            source = video.find("source")
            if source and source.get("src"):
                # src is protocol-relative ("//host/..."): add the scheme.
                video_url = "https:" + source.get("src")

        thumb = soup.find("div", attrs={"class": "thumb"})
        if thumb:
            img_urls = ["http:" + img.get("src") for img in thumb.find_all("img")]
        else:
            img_urls = None

        like_tag = soup.find("i", attrs={"class": "number"})
        like_num = like_tag.string if like_tag else None

        return {
            "author": author,
            "content": content,
            "video_url": video_url,
            "img_urls": img_urls,
            "like_num": like_num,
        }

    def save_imgAndVideo(self, item):
        """Download the item's images and video (if any) to disk.

        Files are named <author><n>.jpg / <author><n>.mp4 inside the
        data directory, which is created on demand.
        """
        media_dir = "./data/糗事百科爬虫图片与视频数据/"
        # Create the target directory up front: the original required the
        # user to create it by hand or open() raised FileNotFoundError.
        os.makedirs(media_dir, exist_ok=True)
        author = item["author"] or "unknown"
        index = 0
        if item["img_urls"]:
            for img_url in item["img_urls"]:
                with open(media_dir + author + str(index) + ".jpg", "wb") as fp:
                    fp.write(requests.get(img_url, headers=self.headers, timeout=10).content)
                index += 1
                print("the image save to local successful...")
        if item["video_url"]:
            with open(media_dir + author + str(index) + ".mp4", "wb") as fp:
                fp.write(requests.get(item["video_url"], headers=self.headers, timeout=10).content)
            index += 1
            print("the video save to local successful...")

    def save_item_toJson(self, item):
        """Append *item* as one JSON line to the data file, then save
        its media via save_imgAndVideo."""
        with open("./data/糗事百科爬虫数据.json", "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")
        print((item["author"] or "unknown") + " successfully saved to local json document...")
        self.save_imgAndVideo(item)

    def run(self):
        """Crawl list pages 1-9; fetch, parse, and save each detail page."""
        for page in range(1, 10):
            list_text = self.get_html_text(self.base_url.format(page))
            if not list_text:
                continue  # skip list pages that failed to download
            for detail_url in self.parse_list_page(list_text):
                detail_text = self.get_html_text(detail_url)
                if not detail_text:
                    continue  # skip detail pages that failed to download
                item = self.parse_detail_page(detail_text)
                self.save_item_toJson(item)
if __name__ == '__main__':
    # Entry point: build the crawler and walk pages 1-9.
    QiuShi().run()
运行代码前,需要在当前目录下创建目录 data/糗事百科爬虫图片与视频数据/,用于存放抓取到的图片和视频。
运行结果如下:
保存数据如下: