# xlb.py
import scrapy
import re
from test_spider. items import TestSpiderItem
class XlbSpider(scrapy.Spider):
    """Crawl the cold-joke listing pages on xiaohua.zol.com.cn.

    Yields one ``TestSpiderItem`` (author + joke text) per article and then
    follows the "next page" link, stopping before page 10.
    """

    name = 'xlb'
    # BUGFIX: no trailing slash here — OffsiteMiddleware compares the request
    # host against these entries, and 'xiaohua.zol.com.cn/' can never match a
    # hostname, so every non-dont_filter request would be dropped as offsite.
    allowed_domains = ['xiaohua.zol.com.cn']
    start_urls = ['http://xiaohua.zol.com.cn/lengxiaohua/1.html']
    base_domain = 'http://xiaohua.zol.com.cn'

    def parse(self, response):
        """Extract every joke on one listing page, then schedule the next page.

        ``response`` is an ``HtmlResponse``; ``xpath()`` returns a
        ``SelectorList`` whose ``get()``/``getall()`` yield str / list[str].
        Items yielded here are handed to the pipelines enabled via
        ``ITEM_PIPELINES`` in settings.py.
        """
        articles = response.xpath(
            '//ul[@class="article-list"]/li[@class="article-summary"]')
        for article in articles:
            author = article.xpath(
                './/span[@class="article-title"]/a[@target="_blank"]/text()').get()
            text_parts = article.xpath('.//div[@class="summary-text"]//text()').getall()
            # Collapse the text nodes and strip layout whitespace in one pass.
            content = re.sub(r'[\t\r\n]', '', ''.join(text_parts))
            yield TestSpiderItem(author=author, content=content)

        next_href = response.xpath(
            '//div[@class="page"]/a[@class="page-next"]/@href').get()
        # BUGFIX: on the last page there is no next link and .get() returns
        # None; the original crashed with TypeError on str + None.
        if not next_href:
            return
        next_url = self.base_domain + next_href
        # BUGFIX: the original used `'10' in next_url`, which also stops on
        # pages 100, 210, … Keep the intended 10-page cap, but match the
        # page number in the URL exactly.
        page = re.search(r'/(\d+)\.html$', next_url)
        if page and int(page.group(1)) >= 10:
            return
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
# settings.py
# ...
# Base per-request delay in seconds. The original used
# `random.randint(1, 3)`, but settings are evaluated ONCE at startup, so
# that picks a single fixed delay for the whole crawl — it does not
# randomize per request. Scrapy already randomizes each wait to
# 0.5x–1.5x of this value because RANDOMIZE_DOWNLOAD_DELAY defaults to
# True, so a plain constant gives the intended jittered behavior.
DOWNLOAD_DELAY = 2
# ...
# Headers sent with every request. BUGFIX: the User-Agent literal was
# corrupted by formatting ('Mozilla/ 5.0 ( X11; … 78.0 .3904 .108' plus a
# backslash continuation that embedded stray spaces); such a malformed UA
# is easy for the site to reject. Use implicit string concatenation
# instead of a line continuation inside the literal.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'),
}
# ...
# Activate the project's item pipeline. The value (300) is the pipeline's
# order slot in the 0-1000 range — lower numbers run earlier when several
# pipelines are enabled.
ITEM_PIPELINES = {
    'test_spider.pipelines.TestSpiderPipeline': 300,
}
# ...
# pipelines.py
import json
from scrapy. exporters import JsonItemExporter, JsonLinesItemExporter
class TestSpiderPipeline(object):
    """Persist scraped items to 'duanzi.json', one JSON object per line.

    JsonLinesItemExporter writes each item as a self-contained line, so the
    output file is valid JSON-lines without needing start/finish bookkeeping.
    """

    def __init__(self):
        # The exporter requires a binary-mode handle; it does the encoding.
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf8')

    def open_spider(self, spider):
        # Called once when the crawl starts.
        print('spider is running!')

    def process_item(self, item, spider):
        # Called for every item the spider yields.
        self.exporter.export_item(item)
        # Return the item so any later pipelines still receive it.
        return item

    def close_spider(self, spider):
        # Called once when the crawl ends; release the file handle.
        self.fp.close()
        print('spider was closed!')
# items.py
import scrapy
class TestSpiderItem(scrapy.Item):
    """One scraped joke: who posted it and its cleaned text."""

    # Poster's display name (may be None when the XPath finds no match).
    author = scrapy.Field()
    # Joke body with tabs/newlines stripped by the spider.
    content = scrapy.Field()