# A simple crawler; this code serves as a reusable template.
# Written in object-oriented style, so each pipeline stage is a separate method.
import requests
from lxml import etree
import json
class BB(object):
    """Simple crawler template for the Douban front page.

    Pipeline: fetch the page, parse recommended-topic links out of the
    HTML, and save them to disk as JSON. Each stage is its own method so
    the class can serve as a template for other crawlers.
    """

    def __init__(self):
        # Base URL; for pagination, build page-specific URLs from this.
        self.url = "https://www.douban.com"
        self.headers = {
            "User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"
        }
        # NOTE: a raw Cookie header is one string, but requests expects a
        # dict, so the string is split into name/value pairs below.
        cookies = 'bid=CaE0F8bS1Fo; __gads=ID=909dc33bfcc2076a:T=1581476387:S=ALNI_MbZ4jrzEUZ-XOtc_K2keuVnphlCdA; push_noty_num=0; push_doumail_num=0; ll="108288"; _vwo_uuid_v2=DCA8B8479EC9962DA9583DB109414C6BC|2b6f125f8df85eaa8526a461fb2f8d70; __yadk_uid=5gtjCnoJMeEhSDeRjRByUx4Zz8ViPcCi; __utmv=30149280.21093; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1581867944%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPlE9kQOeLGCLm0IBa75DE_QSqt08PlSJ3AAFebIZSxUiescU8j0_1K20xwHD0q8f%26wd%3D%26eqid%3Db1c87716012299e2000000035e4963a3%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1767680939.1581476455.1581743959.1581867945.7; __utmc=30149280; __utmz=30149280.1581867945.7.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; dbcl2="210933638:HvbCs6Bvse4"; ck=oxtR; ap_v=0,6.0; _pk_id.100001.8cb4=cb3b8107db3f7bda.1581735902.5.1581868163.1581863253.; __utmb=30149280.5.10.1581867945'
        self.cook_dict = {}
        for cookie in cookies.split("; "):
            # Split on the FIRST '=' only: cookie values themselves may
            # contain '=' (e.g. __gads=ID=...), which split("=")[1]
            # would silently truncate.
            name, _, value = cookie.partition("=")
            self.cook_dict[name] = value

    # 1. Send the request. Note that for pagination the URL varies.
    def get_response(self, url):
        """Fetch *url* with the stored headers/cookies.

        Returns the response body decoded as UTF-8 text.
        """
        response = requests.get(url, headers=self.headers, cookies=self.cook_dict)
        return response.content.decode("utf-8")

    # 2. Parse the data.
    def parse_data(self, data):
        """Extract recommended-topic links from the HTML string *data*.

        Returns a list of {'title': ..., 'url': ...} dicts, one per
        anchor with class 'rec_topics_name'.
        """
        x_data = etree.HTML(data)
        title_list = x_data.xpath("//a[@class='rec_topics_name']/text()")
        url_list = x_data.xpath("//a[@class='rec_topics_name']/@href")
        # zip pairs each title with its link; no manual index bookkeeping.
        return [{"title": title, "url": url}
                for title, url in zip(title_list, url_list)]

    # 3. Save the data.
    def save_data(self, data):
        """Serialize *data* to JSON and write it to disk.

        A list cannot be written to a file directly, so it is serialized
        with json first. ensure_ascii=False keeps non-ASCII (e.g. Chinese)
        titles human-readable in the UTF-8 output file.
        """
        with open("qqq.html", "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)

    def run(self):
        """Run the full pipeline: fetch -> parse -> save."""
        # For pagination, loop here and build one URL per page.
        html = self.get_response(self.url)
        parsed = self.parse_data(html)
        self.save_data(parsed)
BB().run()