1. Crawling Western movie ratings from Douban
from parse import parse_url  # local helper module; a hedged sketch is given after this example
import json


class DoubanSpider:
    def __init__(self):
        self.temp_url = "https://movie.douban.com/j/search_subjects?type=movie&tag=欧美&sort=recommend&page_limit=20&page_start={}"

    def get_content_list(self, html_str):
        dict_data = json.loads(html_str)
        content_list = dict_data["subjects"]
        print(type(content_list))  # content_list is a list; each element is a dict describing one movie
        return content_list

    def save_content_list(self, content_list):
        with open("douban.json", "a", encoding="utf-8") as f:
            for content in content_list:
                # dump each Python dict as JSON so it can be stored one record per line
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("saved successfully")

    def run(self):  # main logic
        num = 0
        total = 200
        while num < total + 20:
            # 1. build the start_url
            url = self.temp_url.format(num)
            # 2. send the request and get the response
            html_str = parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save
            self.save_content_list(content_list)
            # 5. build the next page's url and repeat steps 2-5
            num += 20


if __name__ == '__main__':
    douban = DoubanSpider()
    douban.run()
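The `parse` module imported above is a local helper and its source is not shown here. Below is a minimal, hypothetical sketch of what `parse_url` might look like, assuming it simply sends a GET request with a browser User-Agent and returns the decoded response body; the header value is illustrative, not taken from the original module.

# parse.py -- hypothetical sketch of the local helper, not the original implementation
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}

def parse_url(url):
    """Send a GET request with a browser User-Agent and return the decoded body."""
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # fail loudly on non-2xx responses
    return response.content.decode()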
2. Crawling jokes from Qiushibaike
from lxml import etree
import requests
import json


class QiubaiSpidera:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}

    def get_url_list(self):
        url_list = [self.url_temp.format(i) for i in range(1, 14)]
        return url_list

    def parse_url(self, url):
        response = requests.get(url, headers=self.header)
        return response.content.decode()

    def get_content_list(self, html_str):
        html = etree.HTML(html_str)
        # 1. group the page into one div per post
        div_list = html.xpath("//div[@class='col1 old-style-col1']/div")
        content_list = []
        for div in div_list:
            item = {}
            item["auther_name"] = div.xpath(".//h2//text()")[0].strip() if len(div.xpath(".//h2//text()")) > 0 else None
            item["content"] = div.xpath(".//span//text()")[0].strip()
            item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
            item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):  # save the data
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("saved successfully")

    def run(self):  # main logic
        # 1. build the url list from the page-number pattern
        url_list = self.get_url_list()
        # 2. send a request for each url and get the response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save
            self.save_content_list(content_list)


if __name__ == '__main__':
    qiubai = QiubaiSpidera()
    qiubai.run()
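Both spiders append one JSON object per line (JSON Lines format) to their output files. As a quick check of the saved data, here is a minimal sketch that reads the records back, assuming the files "douban.json" and "qiubai.txt" were produced by the code above; the helper name load_records is illustrative.

# read back the JSON Lines files written by the two spiders above
import json

def load_records(path):
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                records.append(json.loads(line))
    return records

if __name__ == '__main__':
    movies = load_records("douban.json")
    jokes = load_records("qiubai.txt")
    print(len(movies), len(jokes))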