#取得李毅吧下所有精品贴名称,保存 json文件到项目目录
1.创建项目:
$ scrapy startproject tutorial # tutorial :project_name
2.定义Item:
from scrapy.item import Item, Field


class TiebaItem(Item):
    """Container for a single scraped tieba post.

    Declares the fields the spider may populate; Scrapy Items behave like
    dicts restricted to the declared keys.
    """
    name = Field()         # post title text
    description = Field()  # descriptive text extracted by the spider
    url = Field()          # link (href) of the post
3.新建 tieba_spider.py 文件: 在 spiders 文件夹下新建
from scrapy.spider import Spider
from scrapy.selector import Selector
from tutorial.items import TiebaItem
from scrapy.http import Request


class tiebaSpider(Spider):
    """Crawl the "good posts" (精品/tab=good) listing of the Li Yi tieba.

    Yields one item per post title found on each listing page, then follows
    the "next page" link until none is left.
    """
    name = "tieba"
    # allowed_domains = ["dmoz.org"]
    # download_delay = 2  # download delay in seconds (throttle requests)
    start_urls = [
        "http://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85&ie=utf-8&tab=good&cid=0&pn=0"
    ]

    def parse(self, response):
        """Extract post titles from a listing page and schedule the next page."""
        sel = Selector(response)
        # Post titles are <a class="j_th_tit "> anchors (note the trailing
        # space inside the class attribute — it is part of the page markup).
        sites = sel.xpath('//div/a[@class="j_th_tit "]')
        for site in sites:
            # BUG FIX: the original instantiated DmozItem, but step 2 of this
            # tutorial defines TiebaItem — DmozItem does not exist in items.py.
            item = TiebaItem()
            item['name'] = site.xpath('text()').extract()
            item['url'] = site.xpath('@href').extract()
            item['description'] = site.xpath('text()').extract()
            yield item
        # Follow the pagination "next" link, if present.
        nextLink = sel.xpath('//div[@class="pagination-default clearfix"]/a[@class="next pagination-item "]/@href').extract()
        if nextLink:
            nextLink = nextLink[0]
            # NOTE(review): the extracted href is likely site-relative — if so
            # it must be joined with response.url before building the Request;
            # confirm against the live page markup.
            yield Request(nextLink, callback=self.parse)
4.设置 Pipelines,保存为"scraped_data_utf8.json"文件,
import json
import codecs


class FilterWordsPipeline(object):
    """Write every scraped item as one JSON line to scraped_data_utf8.json."""

    def __init__(self):
        # self.file = open('items.jl', 'wb')
        # codecs.open with utf-8 keeps non-ASCII (Chinese) text readable on disk.
        self.file = codecs.open(
            'scraped_data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to a JSON line, write it, and pass it through unchanged."""
        # ensure_ascii=False writes raw UTF-8 characters instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Release the output file when the spider finishes.

        BUG FIX: the original named this method ``spider_closed``, but Scrapy
        only invokes ``close_spider`` on item pipelines, so the file handle
        was never closed and the last buffered lines could be lost.
        """
        self.file.close()

    # Backward-compatible alias for any code that wired spider_closed by hand
    # (e.g. via the signals dispatcher).
    spider_closed = close_spider
5.设置 settings.py,
# Enable the pipeline: key is the dotted path to the pipeline class,
# value is its priority (0-1000; lower-valued pipelines run first).
ITEM_PIPELINES = {
'tutorial.pipelines.FilterWordsPipeline': 500, # corresponds to the class defined in pipelines.py
}