# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import CategrayItem, ArticleItem
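
# The item classes imported above are assumed (judging from the fields filled
# in below) to be declared roughly like this in ../items.py -- a sketch of the
# expected shape, not the project's actual definitions:
#
#     class CategrayItem(scrapy.Item):
#         categray = scrapy.Field()      # column/category title
#         img_src = scrapy.Field()       # list holding the cover image URL
#
#     class ArticleItem(scrapy.Item):
#         categray = scrapy.Field()
#         art_url = scrapy.Field()
#         art_title = scrapy.Field()
#         date = scrapy.Field()
#         look_number = scrapy.Field()   # view count
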
class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['csdn.net']
    start_urls = ['http://blog.csdn.net/column.html']
    base_url = 'http://blog.csdn.net'
    def parse(self, response):
        # Decide which kind of page this response is.
        if 'column' in response.url:
            divs = response.xpath('//div[contains(@data-mod,"popu_276")]')
            for div in divs:
                # The cover URL is embedded in the inline style attribute,
                # e.g. background-image:url(http://...).
                img_src = div.xpath('div[@class="column_bg"]/@style').extract_first('')
                pattern = re.compile(r'\((http.*?)\)')
                match = pattern.search(img_src)
                rs = match.group(1) if match else ''
                link = div.xpath('a[@class="column_list_link"]/@href').extract_first('')
                # Build the absolute URL and request the column's detail page.
                yield scrapy.Request(
                    url=self.base_url + link,
                    meta={'img_src': rs},
                    callback=self.parse_detail
                )
        elif 'home' in response.url:
            print('forum page')
    # Parse a column's detail page.
    def parse_detail(self, response):
        # Category (column) title.
        categray = response.xpath('//h3/a/text()').extract_first('')
        # Yield the cover separately so a pipeline can download it into the
        # category's own folder.
        cate = CategrayItem()
        cate['categray'] = categray
        cate['img_src'] = [response.meta['img_src']]
        yield cate
        # One <li> per article in this column.
        lis = response.xpath('//ul[@class="detail_list"]/li')
        for li in lis:
            art_url = li.xpath('h4/a/@href').extract_first('')
            art_title = li.xpath('h4/a/text()').extract_first('')
            date = li.xpath('div[@class="detail_b"]/span/text()').extract_first('')
            look_number = li.xpath('div[@class="detail_b"]/em/text()').extract_first('')
            # Fill one item with this article's data.
            item = ArticleItem()
            item['categray'] = categray
            item['art_url'] = [art_url]
            item['date'] = date
            item['look_number'] = look_number
            item['art_title'] = art_title
            yield item
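

# ---------------------------------------------------------------------------
# Minimal sketch (an assumption, not part of the original spider) of the image
# pipeline that the "download the cover into the category's own folder" comment
# in parse_detail relies on. In a real project this would normally live in
# pipelines.py, be enabled via ITEM_PIPELINES, and require IMAGES_STORE (and
# Pillow) to be configured. The class name CoverImagePipeline is hypothetical.
# ---------------------------------------------------------------------------
from scrapy.pipelines.images import ImagesPipeline


class CoverImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Only CategrayItem carries an img_src list; ArticleItem is ignored.
        for url in item.get('img_src', []):
            yield scrapy.Request(url, meta={'categray': item.get('categray', '')})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Store each cover under a folder named after its category.
        folder = request.meta.get('categray', '') or 'uncategorized'
        filename = request.url.rsplit('/', 1)[-1]
        return '{}/{}'.format(folder, filename)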