Create the project
scrapy startproject ithome
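With the default template, this should produce the standard Scrapy skeleton, roughly:

ithome/
    scrapy.cfg
    ithome/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py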
Create the CrawlSpider
scrapy genspider -t crawl it ithome.com
items.py
import scrapy


class IthomeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
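A scrapy.Item behaves like a dict with a fixed set of declared keys. A quick sketch (the field values here are made up) of how the item is used, and why dict(item) works in the pipeline below:

from ithome.items import IthomeItem

item = IthomeItem(title='示例标题', content=['第一段', '第二段'])
print(item['title'])    # fields are read like dict keys
print(dict(item))       # dict(item) is what the pipeline serializes to JSON
# item['author'] = 'x'  # would raise KeyError: 'author' is not a declared Field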
it.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ithome.items import IthomeItem


class ItSpider(CrawlSpider):
    name = 'it'
    allowed_domains = ['ithome.com']
    start_urls = ['https://it.ithome.com/ityejie/']

    rules = (
        # Follow pagination links within the channel listing.
        Rule(LinkExtractor(allow=r'ityejie/'), follow=True),
        # Extract article links from the content area and parse them.
        Rule(LinkExtractor(allow=r'html/it/\d+\.htm', restrict_xpaths='//*[@id="wrapper"]//*[@class="block"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = IthomeItem()
        i['title'] = response.xpath('//*[@id="wrapper"]/div[1]/div[2]/h1/text()').extract_first()
        i['content'] = response.xpath('//*[@id="paragraph"]/p/text()').extract()
        yield i
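The allow argument of LinkExtractor is a regular expression matched against every candidate URL. A quick standalone check of what the second rule accepts (the sample URLs are hypothetical, for illustration only):

import re

pattern = re.compile(r'html/it/\d+\.htm')
print(bool(pattern.search('https://www.ithome.com/html/it/123456.htm')))  # True: looks like an article page
print(bool(pattern.search('https://it.ithome.com/ityejie/')))             # False: listing page, handled by the first rule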
pipelines.py
import json


class IthomePipeline(object):

    def __init__(self):
        # Open the output file once; utf-8 so Chinese text is written correctly.
        self.file = open("it.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # One JSON object per line, keeping non-ASCII characters readable.
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(text)
        return item

    def close_spider(self, spider):
        self.file.close()
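Note that because process_item appends ",\n" after every object, it.json is not itself a valid JSON document. One way to load it back, as a minimal sketch: strip the trailing comma and wrap the content in brackets to form a JSON array:

import json

with open('it.json', encoding='utf-8') as f:
    raw = f.read().rstrip(',\n')       # drop the comma/newline after the last item
items = json.loads('[' + raw + ']')    # wrap in brackets to form a valid JSON array
print(len(items), 'items loaded')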
settings.py
BOT_NAME = 'ithome'

SPIDER_MODULES = ['ithome.spiders']
NEWSPIDER_MODULE = 'ithome.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'ithome.pipelines.IthomePipeline': 300,
}
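When running against a live site it is common to throttle the crawl a little. The settings below are illustrative additions, not part of the original project:

# Optional politeness settings (illustrative values):
DOWNLOAD_DELAY = 1          # wait 1 second between requests
CONCURRENT_REQUESTS = 8     # lower the default concurrency (16)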
Run
scrapy crawl it
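As an alternative to the custom pipeline, Scrapy's built-in feed export can write well-formed JSON directly (remove IthomePipeline from ITEM_PIPELINES if you use this):

scrapy crawl it -o it.json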