#coding=utf-8
import scrapy
import time
import re
from qqcrawler.items import QqcrawlerItem
class QzoneSpider(scrapy.Spider):
    """Crawl qzone.qq.com, yielding one item per page and following on-site links.

    For every response, emits a ``QqcrawlerItem`` with the crawl timestamp,
    the page URL and the page title, then schedules a new request for each
    absolute http(s) link on the page that points back into qzone.qq.
    """

    name = "qzone"
    # allowed_domains = ["qzone.qq.com/"]
    start_urls = [
        # "http://www.ncst.edu.cn/"
        "http://qzone.qq.com/"
        # ,"http://www.qq.com/"
    ]

    # Compiled once at class-definition time instead of re-parsing the
    # pattern for every href; raw string avoids the invalid-escape warning.
    _QZONE_LINK_RE = re.compile(r'^http.*qzone\.qq.*')

    def parse(self, response):
        """Scrapy callback: yield the scraped item, then follow qzone links.

        Fixes vs. the original:
        - the bare ``except:`` that silently swallowed every error
          (masking real bugs and even generator-close signals) is removed;
          Scrapy already logs exceptions raised in callbacks.
        - Python-2 ``print`` debug output replaced with the spider logger.
        - the title XPath is evaluated once instead of twice.
        """
        qq_item = QqcrawlerItem()  # scraped data for this page
        qq_item['c_time'] = time.time()
        qq_item['url'] = response.url
        # ``extract()`` returns a list of matches; ``or None`` preserves the
        # original behavior of storing None when no <title> is present.
        qq_item['title'] = response.xpath('/html/head/title').extract() or None
        yield qq_item
        for href in response.xpath('//@href').extract():
            if self._QZONE_LINK_RE.match(href):
                self.logger.debug('following link: %s', href)
                # Feed the link back into the crawl frontier.
                yield scrapy.Request(href, callback=self.parse)
# Scrapy crawler demo.
# (Blog-page footer text that was accidentally pasted here has been turned
# into this comment — it was prose, not valid Python.)