**spider目录下的文件:定义DemoSpider类**
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from img.items import ImgItem
#from bs4 import BeautifulSoup
#import urllib
#import requests
class DemoSpider(CrawlSpider):
    """Crawl xiaohuar.com list pages and scrape each detail page.

    Two rules drive the crawl:
      * the first follows pagination links (no callback, follow=True);
      * the second extracts per-entry detail links and passes each
        response to ``parse_item`` (follow=False).

    ``allow`` is a regex the URL must match; ``restrict_xpaths``
    limits where the LinkExtractor looks for links.
    """
    name = 'demo'
    start_urls = ['http://www.xiaohuar.com/list-1-2.html']

    # NOTE: rules must be an ordered sequence (tuple/list), not a set --
    # Scrapy applies the FIRST matching Rule to each extracted link, and
    # a set has no defined iteration order.
    rules = (
        # Follow pagination links found in the page-number bar.
        Rule(LinkExtractor(allow=('http://www.xiaohuar.com/list',),
                           restrict_xpaths="//div[@class='page_num']"),
             follow=True),
        # Extract detail-page links from entry titles and parse them.
        Rule(LinkExtractor(allow=('/p',),
                           restrict_xpaths="//div[@class='title']"),
             callback="parse_item",
             follow=False),
    )

    def parse_item(self, response):
        """Scrape image URL, name and school from a detail page.

        Yields one ``ImgItem``; on any scraping failure the error is
        printed and the crawl continues (best-effort).
        """
        item = ImgItem()
        url = response.url
        print("url=%s" % url)
        try:
            img_url = response.xpath(
                "//div[@class='infoleft_imgdiv']/a/img/@src").extract()[0]
            name = response.xpath(
                "//div[@class='infodiv']/table/tbody/tr[1]/td[2]/text()").extract()
            school = response.xpath(
                "//div[@class='infodiv']/table/tbody/tr[5]/td[2]/text()").extract()
            # Relative image paths need the site prefix prepended.
            if 'http://www.xiaohuar.com' not in img_url:
                item['url'] = 'http://www.xiaohuar.com' + img_url
            else:
                item['url'] = img_url
            item['name'] = name
            item['school'] = school
            yield item
        except Exception as exc:
            # Best-effort: report what went wrong instead of a bare 'error',
            # but never abort the whole crawl over one bad page.
            print('error: %s' % exc)
**定义items文件**
import scrapy


class ImgItem(scrapy.Item):
    """Container for one scraped entry: image URL, name and school."""

    url = scrapy.Field()
    name = scrapy.Field()
    school = scrapy.Field()
**定义pipelines文件**
import codecs
import json
import urllib
import os
class ImgPipeline(object):
    """Scrapy item pipeline that appends every item to ``items.json``,
    one JSON object per line (JSON Lines format)."""

    def __init__(self):
        # Open with utf-8 so non-ASCII text is written readably
        # instead of being escaped / garbled.
        self.file = codecs.open('items.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Required pipeline hook: serialize *item*, write it, return it.

        ensure_ascii=False keeps Chinese characters human-readable;
        the trailing newline puts one item per line.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        # Return the item so any later pipelines can keep processing it.
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook when the spider finishes.  The original
        # code only defined ``close_file``, which Scrapy never invokes,
        # so the file handle was leaked and the last buffer unflushed.
        self.file.close()

    def close_file(self):
        # Kept for backward compatibility with any manual callers.
        self.file.close()
**定义settings文件**
# Add the following to settings.py to enable the pipeline; 300 is the
# priority (0-1000, lower-numbered pipelines run first).
ITEM_PIPELINES={
"img.pipelines.ImgPipeline":300
}
scrapy框架爬取校花网站的升级版
最新推荐文章于 2021-12-12 08:00:00 发布