There is nothing especially difficult about crawling Qidian, so let's just walk through the basic crawler steps. Without further ado, here is the code.
```python
import csv
import json

import requests
from lxml import etree


class QiDianSpider(object):
    def __init__(self):
        self.base_url = "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        }
        self.items = []

    def get_url_list(self):
        # Build the list of listing-page URLs to crawl
        return [self.base_url.format(start) for start in range(0, 201)]

    def save_item(self, item):
        self.items.append(item)

    def run(self):
        url_list = self.get_url_list()
        for url in url_list:
            # 1. Fetch the page
            response = requests.get(url, headers=self.headers)
            html = response.content.decode("utf-8")
            # 2. Parse it and extract each book entry
            eroot = etree.HTML(html)
            rows = eroot.xpath('//ul[@class="all-img-list cf"]/li')
            print(rows)
            for row in rows:
                item = {}
                item["title"] = row.xpath('.//h4/a')[0].xpath('./text()')[0]
                item["author"] = row.xpath('./div[@class="book-mid-info"]/p[@class="author"]//a[@class="name"]')[0].xpath('./text()')[0]
                print(item)
                self.save_item(item)

        # 3. Save the results as JSON
        with open('起点.json', 'w', encoding='utf-8') as f:
            json.dump(self.items, f, ensure_ascii=False, indent=2)

        # 4. Save the results as CSV
        # Create a file object to write to
        out_file = open('起点.csv', 'w', encoding='utf-8', newline='')
        # Create a csv writer object
        writer = csv.writer(out_file)
        # Write the header row
        writer.writerow(self.items[0].keys())
        # Write the data rows
        for item in self.items:
            writer.writerow(item.values())
        out_file.close()


if __name__ == '__main__':
    spider = QiDianSpider()
    spider.run()
```
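Since each item is a plain dict, the CSV step can also be written with `csv.DictWriter`, which matches columns to keys by name and writes the header for you. A minimal sketch (the sample data below is made up for illustration, not taken from the crawl):

```python
import csv

# Hypothetical sample items, in the same shape the spider collects
items = [{"title": "示例书名", "author": "示例作者"}]

with open('起点.csv', 'w', encoding='utf-8', newline='') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=list(items[0].keys()))
    writer.writeheader()     # header row built from the field names
    writer.writerows(items)  # one row per item dict
```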
The crawler is organized with an object-oriented mindset: building the URL list, fetching, parsing, and saving each live in their own method, so the flow is easy to follow and extend.
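To show where that object-oriented structure can lead, here is a small sketch of how the same idea generalizes to a reusable base class. The `BaseSpider` name and the `parse()` hook are my own illustration, not part of the original post; a concrete spider would only need to supply its URLs and parsing logic.

```python
import requests


class BaseSpider(object):
    """Skeleton spider: subclasses override get_url_list() and parse()."""

    headers = {"User-Agent": "Mozilla/5.0"}

    def get_url_list(self):
        # Return the list of URLs to crawl
        raise NotImplementedError

    def parse(self, html):
        # Return a list of item dicts extracted from one page of HTML
        raise NotImplementedError

    def run(self):
        items = []
        for url in self.get_url_list():
            response = requests.get(url, headers=self.headers)
            items.extend(self.parse(response.content.decode("utf-8")))
        return items
```

With this split, the fetching loop is written once, and the Qidian-specific XPath extraction would sit entirely inside a subclass's `parse()` method.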