I've just started learning this framework; this is my first crawler, built on the basic Spider class. It searches the Hangzhou talent site (www.hzrc.com) by keyword and scrapes the matching job posts. The code is below.
items.py

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join, Identity


def str_strip(value):
    # strip whitespace: newlines, tabs, carriage returns and spaces
    return value.strip().replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '')


def except_name(contact):
    # the extracted values arrive as list elements shaped like '联系人:xxx';
    # keep only the part after the full-width colon
    if ':' in contact:
        contact = contact[contact.index(':') + 1:]
    return contact


class hzTalentItemLoad(ItemLoader):
    # default output processor: take the first element of the extracted list.
    # Fields whose output should not be just the first element override this.
    default_output_processor = TakeFirst()


class hzTalent(scrapy.Item):
    title = scrapy.Field()
    wages = scrapy.Field(
        input_processor=MapCompose(str_strip)
    )
    contact = scrapy.Field(
        input_processor=MapCompose(except_name),
    )
    tel = scrapy.Field(
        input_processor=Join(';'),
        output_processor=Identity(),
    )
    email = scrapy.Field()
    company = scrapy.Field()
    description = scrapy.Field(
        input_processor=MapCompose(str_strip)
    )
    url = scrapy.Field()
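To see what these processors actually do to the raw lists that XPath extraction returns, here is a standalone sketch that runs without any spider (the sample strings are made up for illustration):

from scrapy.loader.processors import TakeFirst, MapCompose, Join

raw_wages = ['\n 5000-8000 \t']              # hypothetical extract() result
raw_tels = ['0571-00000000', '13800000000']  # hypothetical phone numbers

strip_ws = MapCompose(str.strip)             # applied to every list element
print(strip_ws(raw_wages))                   # ['5000-8000']
print(TakeFirst()(strip_ws(raw_wages)))      # '5000-8000'
print(Join(';')(raw_tels))                   # '0571-00000000;13800000000'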
pipelines.py

import json
import codecs

from scrapy.exceptions import DropItem


class notNullPipeLine(object):
    # drop items that have no title
    def process_item(self, item, spider):
        if item['title']:
            return item
        else:
            raise DropItem('missing title in %s' % item)


class notRepeatPipeLine(object):
    # drop duplicate items, keyed on the detail-page URL
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        url = item['url']
        if url in self.ids_seen:
            raise DropItem('duplicate item found: %s' % item)
        else:
            self.ids_seen.add(url)
            return item


class JsonPipeLine(object):
    # write every item to a JSON file, one object per line
    def __init__(self):
        self.file = codecs.open('E:\\hztalent.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        print(item)
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # the hook Scrapy calls is close_spider(self, spider), not spider_close(self);
        # with the wrong name the file would never be closed
        self.file.close()
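If you would rather not serialize by hand, Scrapy ships a built-in JsonLinesItemExporter that does the same line-per-item export. A minimal sketch (the class name JsonExportPipeLine is mine; it would replace JsonPipeLine in ITEM_PIPELINES):

from scrapy.exporters import JsonLinesItemExporter


class JsonExportPipeLine(object):
    def open_spider(self, spider):
        # exporters expect a binary-mode file object
        self.file = open('E:\\hztalent.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()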
settings.py

ITEM_PIPELINES = {
    # enable the pipelines; lower numbers run earlier, so empty items are
    # dropped first, then duplicates, and whatever is left gets exported
    'scrapyDemo.pipelines.notNullPipeLine': 100,
    'scrapyDemo.pipelines.notRepeatPipeLine': 200,
    'scrapyDemo.pipelines.JsonPipeLine': 300,
}
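As an aside: for a plain JSON dump you don't strictly need an export pipeline at all, since Scrapy can write a feed from the command line (not what this post does, just worth knowing):

scrapy crawl hzTalent -o hztalent.json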
spider.py

import scrapy

from scrapyDemo.items import hzTalent, hzTalentItemLoad
# DocumentUtil and RegexUtil are my own helper classes, listed at the end of the post


class hzTalents(scrapy.spiders.Spider):
    name = 'hzTalent'
    # allowed_domains = ['www.hzrc.com']
    # download_delay = 1
    search_name = ['python']
    cookie = 'UM_distinctid=16381d9bcac81-087e1a93ca16be-4c312a7a-13c680-16381d9bcae1cd; CNZZDATA2145298=cnzz_eid%3D1012120389-1526889235-%26ntime%3D1542603351; _ubm_id.d2cc513d10e26176994c26da25947ea2=4c8802701e1cfdd3; Hm_lvt_46b0265c3ac55b35a0fc9e4683094a94=1542346050; _ubm_ref.d2cc513d10e26176994c26da25947ea2=%5B%22%22%2C%22%22%2C1542608220%2C%22http%3A%2F%2Fwww.ksousou.cn%2FDetail%2F5124541.html%22%5D; JSESSIONID=ph4qn_0U7zpPPYcoTZp28UaZr43vQFEEsUy4EMThsVjUmbL2K0es!1789549506; Hm_lpvt_46b0265c3ac55b35a0fc9e4683094a94=1542608372; longinUser=wc; _ubm_ses.d2cc513d10e26176994c26da25947ea2=*'
    post_url = 'http://www.hzrc.com/ww/b/c/wwbc_result.html'

    def start_requests(self):
        self.cookie = DocumentUtil.change_string_to_dict(self.cookie, ';')
        # Send the search as a POST via FormRequest; every formdata value must be
        # a string. dont_filter=True exempts this request from the duplicate
        # filter, because get_page requests the same page-1 URL again in its loop.
        for name in self.search_name:
            requests = scrapy.FormRequest(
                url=self.post_url,
                dont_filter=True,
                cookies=self.cookie,
                formdata={
                    'option': '', 'type': '', 'pageNo': '1', 'ishcj': '',
                    'aca111': '', 'aab301': '', 'acc217': '', 'aac011': '',
                    'aab056': '', 'aab020': '', 'acb241': '', 'aae396': '',
                    'acb239': '', 'acb210s': '', 'conditionsall': '',
                    'addtosearch': '', 'aca112': '', 'aab010': '',
                    'keyword': name,
                },
                callback=self.get_page)
            # hand the keyword to the callback through meta
            requests.meta['name'] = name
            yield requests
    # work out the page count, then request every result page
    def get_page(self, response):
        # the search keyword passed through meta
        name = response.meta['name']
        page = 1
        page_str = response.xpath('//div[@class="pagebar_wrap"]/span[last()-2]/text()').extract()
        if page_str:
            info = ''.join(page_str)
            page = int(info)
        # loop over the pages, starting again from page 1
        for page_num in range(1, page + 1):
            # formdata values must be strings, not ints
            num_str = str(page_num)
            yield scrapy.FormRequest(
                url=self.post_url,
                cookies=self.cookie,
                formdata={
                    'option': '', 'type': '', 'pageNo': num_str, 'ishcj': '',
                    'aca111': '', 'aab301': '', 'acc217': '', 'aac011': '',
                    'aab056': '', 'aab020': '', 'acb241': '', 'aae396': '',
                    'acb239': '', 'acb210s': '', 'conditionsall': '',
                    'addtosearch': '', 'aca112': '', 'aab010': '',
                    'keyword': name,
                },
                callback=self.get_list_info)
    # walk the result list and request each detail page
    def get_list_info(self, response):
        allInfoList = response.xpath('//li[@id="aa"]')
        for each in allInfoList:
            detail_str = each.xpath('div/div[@class="bg4_2"]/a/@onclick').extract()
            detail_info = ''.join(detail_str)
            # pull the job id out of the onclick attribute with a regex
            url_id = RegexUtil.match_between_sign(detail_info, '\'', '\')')
            detail_url = 'http://www.hzrc.com/ww/b/c/wwbc_jobdeatils.html?acb210=' + url_id
            yield scrapy.Request(url=detail_url, cookies=self.cookie, callback=self.detail_info)
    # scrape the detail page
    def detail_info(self, response):
        # fill the item through an ItemLoader; the custom hzTalentItemLoad subclass
        # is used because the default output processor has to be overridden
        itcastItemLoader = hzTalentItemLoad(item=hzTalent(), response=response)
        itcastItemLoader.add_value('url', response.url)
        itcastItemLoader.add_xpath('title', '//div[@class="postcom"]/span[@class="postname"]/text()')
        itcastItemLoader.add_xpath('company', '//div[@class="postcom"]/span[@class="comname"]/text()')
        itcastItemLoader.add_xpath('wages', '//div[@class="postpartdiv1"][1]/div[2]/text()')
        itcastItemLoader.add_xpath('description', 'string(//div[@class="postpartcon"])')
        itcastItemLoader.add_xpath('contact', '//div[@class="postcontact"]/span[1]/text()')
        # both phone fields feed the same 'tel' field; Join(';') merges them
        itcastItemLoader.add_xpath('tel', '//div[@class="postcontact"]/span[2]/span/text()')
        itcastItemLoader.add_xpath('tel', '//div[@class="postcontact"]/span[3]/span/text()')
        itcastItemLoader.add_xpath('email', '//div[@class="postcontact"]/span[4]/span/text()')
        yield itcastItemLoader.load_item()
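A side note on the requests.meta['name'] = name line above: Request and FormRequest also accept the dict directly through the meta keyword, which reads a bit tighter. A minimal, self-contained sketch of that variant (the spider name and callback are mine, and the formdata is trimmed, purely for illustration):

import scrapy


class MetaDemoSpider(scrapy.Spider):
    name = 'metaDemo'  # hypothetical demo spider

    def start_requests(self):
        for name in ['python']:
            yield scrapy.FormRequest(
                url='http://www.hzrc.com/ww/b/c/wwbc_result.html',
                dont_filter=True,
                formdata={'pageNo': '1', 'keyword': name},  # trimmed for brevity
                meta={'name': name},  # shows up as response.meta['name'] in the callback
                callback=self.parse_page)

    def parse_page(self, response):
        self.logger.info('keyword was %s', response.meta['name'])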
Other helper methods (these live in my DocumentUtil and RegexUtil classes; the regex helper needs import re):

# DocumentUtil: convert a cookie string into the dict that Scrapy's cookies= expects
@staticmethod
def change_string_to_dict(cookie, symbol='&&'):
    cookie_dict = {}
    for line in cookie.split(symbol):
        # split on the first '=' only, so values that themselves contain '='
        # (like the session tokens above) are kept intact
        name, value = line.strip().split('=', 1)
        cookie_dict[name] = value
    return cookie_dict
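A quick check of what the conversion produces, assuming the method sits on a DocumentUtil class as the spider's call implies (cookie values shortened and made up here):

class DocumentUtil:
    # same helper as above, wrapped in its (assumed) utility class
    @staticmethod
    def change_string_to_dict(cookie, symbol='&&'):
        cookie_dict = {}
        for line in cookie.split(symbol):
            name, value = line.strip().split('=', 1)
            cookie_dict[name] = value
        return cookie_dict


cookie_str = 'longinUser=wc; JSESSIONID=abc123!456'  # shortened, made-up values
print(DocumentUtil.change_string_to_dict(cookie_str, ';'))
# {'longinUser': 'wc', 'JSESSIONID': 'abc123!456'}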
# RegexUtil: return the text between prefix and suffix. index selects which
# occurrence to return (0 = the first). With suffix_more_char=True the suffix is
# matched as a whole (non-greedy) string and index is forced back to 0;
# otherwise every character of suffix is excluded via a character class.
@staticmethod
def match_between_sign(text, prefix, suffix, index=0, suffix_more_char=False):
    pattern = re.compile(prefix + r'([^' + suffix + ']+)')
    if suffix_more_char:
        pattern = re.compile(prefix + r'(.*?)' + suffix)
        index = 0
    match = re.findall(pattern, text)
    if len(match) > 0:
        return match[index]
    else:
        return None
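Applied to an onclick value shaped like the ones on the result page (the function name and id here are made up), the helper pulls out the job id that get_list_info needs:

import re


class RegexUtil:
    # same helper as above, wrapped in its (assumed) utility class
    @staticmethod
    def match_between_sign(text, prefix, suffix, index=0, suffix_more_char=False):
        pattern = re.compile(prefix + r'([^' + suffix + ']+)')
        if suffix_more_char:
            pattern = re.compile(prefix + r'(.*?)' + suffix)
            index = 0
        match = re.findall(pattern, text)
        return match[index] if match else None


detail_info = "showDetail('AB12345678')"  # made-up onclick string
print(RegexUtil.match_between_sign(detail_info, '\'', '\')'))  # AB12345678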
Points to note in this example:
1. The pagination loop in get_page starts again from page 1, which start_requests has already fetched once, so the first request needs dont_filter=True or the scheduler's duplicate filter would drop the repeat (see the sketch below).
2. To change ItemLoader's default processors, write a subclass of ItemLoader and instantiate that subclass in the spider.
3. Every value that an ItemLoader's add_* methods collect is a list; keep that in mind when writing the processors.
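Point 1 can be verified directly against Scrapy's default duplicate filter, without running a crawl (formdata trimmed here to the two fields that matter):

from scrapy.dupefilters import RFPDupeFilter
from scrapy.http import FormRequest

# the same page-1 POST, issued twice, as start_requests and get_page would do
r1 = FormRequest('http://www.hzrc.com/ww/b/c/wwbc_result.html',
                 formdata={'pageNo': '1', 'keyword': 'python'})
r2 = FormRequest('http://www.hzrc.com/ww/b/c/wwbc_result.html',
                 formdata={'pageNo': '1', 'keyword': 'python'})

df = RFPDupeFilter()
print(df.request_seen(r1))  # False: first time this fingerprint shows up
print(df.request_seen(r2))  # True: without dont_filter the repeat is dropped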
Appendix: ItemLoader introduction https://blog.csdn.net/zwq912318834/article/details/79530828
Request introduction https://yq.aliyun.com/articles/623230?utm_content=m_1000011825