抓包分析,找到整页数据所在,可以看到是在"lang"下。
继续分析,找到每一本数据所在
分析完毕,进行编码爬取。
1.编写item
class BookItem(scrapy.Item):
    """Container for the fields scraped for one novel on the ranking page."""
    # Novel title
    book_name = scrapy.Field()
    # Author name
    author = scrapy.Field()
    # Genre / category
    book_type = scrapy.Field()
    # Serialization status
    book_state = scrapy.Field()
    # Title of the most recent chapter
    book_update = scrapy.Field()
    # Timestamp of the latest update
    book_time = scrapy.Field()
    # URL of the latest chapter
    new_href = scrapy.Field()
    # Short synopsis
    book_intro = scrapy.Field()
2.编写Spider
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from Mycrawl.items import BookItem
import requests
class BookSpider(Spider):
    """Scrape the Qidian monthly-ticket ranking page (first page only)."""

    # Spider name, used by `scrapy crawl book` — important.
    name = 'book'
    # Minimal anti-bot measure: present a regular browser User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}

    def start_requests(self):
        """Issue the single request for the ranking page."""
        url = "https://www.qidian.com/rank/yuepiao?style=1"
        yield Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Yield one BookItem per novel listed on the page.

        Fixes vs. the original:
        - a fresh item is created per novel (the original mutated and
          re-yielded ONE shared item, leaking state between yields);
        - `extract_first(default='')` replaces `extract()[0]`, which
          raised IndexError whenever a field was missing;
        - locals no longer shadow the builtins `type` and `time`.
        """
        for book in response.xpath('//div[@class="book-mid-info"]'):
            item = BookItem()
            item['book_name'] = book.xpath('h4/a/text()').extract_first(default='')
            item['author'] = book.xpath('p[@class="author"]/a[@class="name"]/text()').extract_first(default='')
            item['book_type'] = book.xpath('p[@class="author"]/a[@data-eid="qd_C42"]/text()').extract_first(default='')
            item['book_state'] = book.xpath('p[@class="author"]/span/text()').extract_first(default='')
            item['book_update'] = book.xpath('p[@class="update"]/a[@target="_blank"]/text()').extract_first(default='')
            item['book_time'] = book.xpath('p[@class="update"]/span/text()').extract_first(default='')
            # Links on the page are protocol-relative ("//..."); prefix a scheme.
            latest_href = book.xpath('p[@class="update"]/a/@href').extract_first(default='')
            item['new_href'] = 'https:' + latest_href
            # The intro may span several text nodes; join and strip whitespace.
            intro_parts = book.xpath('p[@class="intro"]/text()').extract()
            item['book_intro'] = ''.join(intro_parts).replace(' ', '').replace('\n', '')
            yield item
3.编写pipelines连接数据库
Mysql
import pymysql
class BookPipeline(object):
    """Persist scraped BookItems into a MySQL table via pymysql."""

    def __init__(self):
        # Connect to the local MySQL server.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # Cursor used for all statements in this pipeline.
        self.cursor = self.conn.cursor()
        # Start every crawl from an empty table.
        self.cursor.execute('truncate table Book')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one scraped book row; on failure, roll back and report it."""
        try:
            # Parameterized query — values are never interpolated into SQL.
            self.cursor.execute(
                "insert into Book (book_name,author,book_type,book_state,book_update,book_time,new_href,book_intro) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                (item['book_name'], item['author'], item['book_type'],
                 item['book_state'], item['book_update'], item['book_time'],
                 item['new_href'], item['book_intro']))
            self.conn.commit()
        except pymysql.Error:
            # Undo the failed statement so the connection stays usable
            # (the original left the transaction dangling).
            self.conn.rollback()
            print("Error%s,%s,%s,%s,%s,%s,%s,%s" % (item['book_name'], item['author'], item['book_type'],
                                                    item['book_state'], item['book_update'], item['book_time'],
                                                    item['new_href'], item['book_intro']))
        return item

    def close_spider(self, spider):
        # Scrapy calls this when the crawl ends; release DB resources
        # (the original never closed the connection — resource leak).
        self.cursor.close()
        self.conn.close()
MongoDB
import pymongo
class BookPipeline(object):
    """Persist scraped BookItems into a MongoDB collection."""

    def __init__(self):
        # Connect to the local MongoDB server.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.test = self.client['TESTDB']
        # NOTE(review): collection is named 'movie' although it stores books —
        # presumably copied from an earlier example; verify before reuse.
        self.post = self.test['movie']

    def process_item(self, item, spider):
        """Insert one item as a plain document."""
        # Collection.insert() was deprecated and removed in pymongo 4.x;
        # insert_one() is the supported single-document API.
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Release the client connection when the crawl ends.
        self.client.close()
4.编写setting
# Project identity, used in logs and the default User-Agent string.
BOT_NAME = 'Mycrawl'
# Module(s) where Scrapy discovers spider classes.
SPIDER_MODULES = ['Mycrawl.spiders']
NEWSPIDER_MODULE = 'Mycrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mycrawl (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enabled item pipelines; the number (0-1000) sets execution order, lower first.
ITEM_PIPELINES = {
    #'Mycrawl.pipelines.MoviePipeline': 100,
    'Mycrawl.pipelines.BookPipeline': 300,
}
5.结果
Mysql
MongoDB
本文只爬取了第一页的,大家可以自己尝试爬取所有页的数据。
爬虫项目地址:github