I. Set up the project
1. Create a new Python project named jobman.
2. Generate the Scrapy project.
Change into the jobman directory where the code will live and run the following at a cmd prompt:
scrapy startproject zhaopin
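This produces the standard Scrapy project skeleton under jobman:
jobman/
    zhaopin/
        scrapy.cfg
        zhaopin/
            __init__.py
            items.py
            pipelines.py
            settings.py
            spiders/
                __init__.py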
II. Edit the source code
Three files need to be modified:
items.py
pipelines.py
settings.py
Two new files need to be created:
wuyaojob_spider.py
mssql.py
1) Change items.py to the following:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class CompanyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    domain = scrapy.Field()
    url = scrapy.Field()
    comname = scrapy.Field()
    typename = scrapy.Field()
    comdemo = scrapy.Field()
    areaname = scrapy.Field()
    address = scrapy.Field()
    tel = scrapy.Field()
    email = scrapy.Field()

class JobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    domain = scrapy.Field()
    comurl = scrapy.Field()
    url = scrapy.Field()
    comname = scrapy.Field()
    jobname = scrapy.Field()
    jobdemo = scrapy.Field()
    areaname = scrapy.Field()
    address = scrapy.Field()
    salary = scrapy.Field()
    eduname = scrapy.Field()
    jobyear = scrapy.Field()
    begindate = scrapy.Field()
    enddate = scrapy.Field()
    fuli = scrapy.Field()
    tradename = scrapy.Field()
2) Modify pipelines.py as follows:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from zhaopin.items import CompanyItem, JobItem
from zhaopin.mssql import MsSql

class ZhaopinPipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, CompanyItem):
            ms = MsSql(host="192.168.0.28", user="sa", pwd="sa", db="TalCrawl")
            if len(item['comname']) == 0:
                pass
            else:
                newsql = ("insert into cra_craw_comdata(domain,url,comname) "
                          "values('%s','%s','%s')"
                          % (item['domain'], item['url'], item['comname']))
                print(newsql)
                #ms.ExecNonQuery(newsql.encode('utf-8'))
        elif isinstance(item, JobItem):
            if len(item['jobname']) == 0 or len(item['comname']) == 0:
                pass
            else:
                ms = MsSql(host="192.168.0.28", user="sa", pwd="sa", db="TalCrawl")
                newsql = ("insert into cra_craw_jobdata(domain,url,jobname,comurl,comname,"
                          "salary,eduname,fuli,jobdemo) "
                          "values('%s','%s','%s','%s','%s','%s','%s','%s','%s')"
                          % (item['domain'], item['url'], item['jobname'],
                             item['comurl'], item['comname'], item['salary'],
                             item['eduname'], item['fuli'], item['jobdemo']))
                print(newsql)
                #ms.ExecNonQuery(newsql.encode('utf-8'))
        else:
            pass
        return item
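One caveat: building SQL with the % operator breaks as soon as a scraped value contains a single quote, and it is open to SQL injection. A safer variant is pymssql's built-in parameter substitution; below is a minimal sketch of the company insert, using the same placeholder host and credentials as above (not the author's final code):
# Sketch: parameterized insert with pymssql instead of string formatting.
import pymssql

conn = pymssql.connect(host="192.168.0.28", user="sa", password="sa",
                       database="TalCrawl", charset="utf8")
cur = conn.cursor()
cur.execute(
    "insert into cra_craw_comdata(domain,url,comname) values (%s, %s, %s)",
    (item['domain'], item['url'], item['comname']))
conn.commit()
conn.close()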
3) Modify this part of settings.py as follows:
ITEM_PIPELINES = {
'zhaopin.pipelines.ZhaopinPipeline': 300,
}
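Depending on the Scrapy version and the target site, a couple of other settings are commonly tuned alongside ITEM_PIPELINES. These additions are assumptions, not from the original project:
# Assumed extras, not in the original post:
ROBOTSTXT_OBEY = False   # newer Scrapy obeys robots.txt by default
DOWNLOAD_DELAY = 1       # throttle requests to be polite to the site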
4) Create a new file wuyaojob_spider.py in the zhaopin/spiders directory:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import CompanyItem, JobItem

class wuyaojobSpider(scrapy.Spider):
    name = "wuyaojob"
    #allowed_domains = ["51job.com"]
    domain = "51job.com"
    start_urls = ['http://51job.com']
    #rules = (
    #    ## Follow links matching 'category.php' (but not 'subsection.php');
    #    ## with no callback, follow defaults to True
    #    #Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
    #    # Extract links matching 'item.html' and parse them with the spider's parse_company method
    #    Rule(LinkExtractor(allow=('\.html', )), callback='parse_company'),
    #    Rule(LinkExtractor(allow=('\.htm', )), callback='parse_job'),
    #)

    def start_requests(self):
        requests = []
        for item in self.start_urls:
            requests.append(scrapy.Request(url=item, headers={'Referer': 'http://www.baidu.com/'}, callback=self.parse))
        return requests

    def parse(self, response):
        #self.logger.info("Links found: %s" % response.xpath('//a/@href').extract())
        for url in response.xpath('//a/@href').extract():
            url = response.urljoin(url)  # extracted hrefs may be relative
            if url.endswith('index.htm'):
                yield scrapy.Request(url, headers={'Referer': response.url}, callback=self.parse_company)
            elif url.endswith('.html') and url.startswith('http://jobs.51job.com/all/'):
                yield scrapy.Request(url, headers={'Referer': response.url}, callback=self.parse_job)
    def parse_company(self, response):
        links = response.xpath('//a')
        items = []
        for link in links:
            item = CompanyItem()
            item['domain'] = self.domain
            item['url'] = link.xpath('.//@href').extract()
            if len(item['url']) > 0:
                item['url'] = item['url'][0]
            else:
                item['url'] = ''
            item['comname'] = link.xpath('.//text()').extract()
            if len(item['comname']) > 0:
                item['comname'] = item['comname'][0]
            else:
                item['comname'] = ''
            items.append(item)
            #self.logger.info('url: %s name: %s' % (item['url'], item['comname']))
        return items
    def parse_job(self, response):
        #self.log('Job page url: %s' % response.url)
        item = JobItem()
        item['domain'] = self.domain
        item['url'] = response.url
        #/html/body/div[2]/div[2]/div[2]/div/div[1]/h1
        item['jobname'] = response.xpath('//div[@class="cn"]//h1//text()').extract()
        if len(item['jobname']) > 0:
            item['jobname'] = item['jobname'][0]
        else:
            item['jobname'] = ''
        item['comname'] = response.xpath('//p[@class="cname"]//a//text()').extract()
        if len(item['comname']) > 0:
            item['comname'] = item['comname'][0]
        else:
            item['comname'] = ''
        item['comurl'] = response.xpath('//p[@class="cname"]//a//@href').extract()
        if len(item['comurl']) > 0:
            item['comurl'] = item['comurl'][0]
        else:
            item['comurl'] = ''
        item['salary'] = response.xpath('//div[@class="cn"]//strong//text()').extract()
        if len(item['salary']) > 0:
            item['salary'] = item['salary'][0]
        else:
            item['salary'] = ''
        item['fuli'] = response.xpath('//p[@class="t2"]//text()').extract()
        if len(item['fuli']) > 0:
            item['fuli'] = item['fuli'][0]
        else:
            item['fuli'] = ''
        item['jobdemo'] = response.xpath('//div[@class="bmsg job_msg inbox"]//text()').extract()
        if len(item['jobdemo']) > 0:
            item['jobdemo'] = item['jobdemo'][0]
        else:
            item['jobdemo'] = ''
        item['eduname'] = response.xpath('//div[@class="t1"]/span[2]//text()').extract()
        if len(item['eduname']) > 0:
            item['eduname'] = item['eduname'][0]
        else:
            item['eduname'] = ''
        item['tradename'] = response.xpath('//p[@class="fp f2"]/span[@class="el"]//text()').extract()
        if len(item['tradename']) > 0:
            item['tradename'] = item['tradename'][0]
        else:
            item['tradename'] = ''
        return item
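For reference, the commented-out rules in the spider suggest a CrawlSpider-based alternative was considered. A minimal sketch of what that could look like (an assumption reconstructed from those rules, not the author's code):
# Hypothetical CrawlSpider variant, sketched from the commented-out rules.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class WuyaojobCrawlSpider(CrawlSpider):
    name = 'wuyaojob_crawl'
    allowed_domains = ['51job.com']
    start_urls = ['http://51job.com']
    rules = (
        # company pages end in index.htm
        Rule(LinkExtractor(allow=(r'index\.htm$', )), callback='parse_company'),
        # job detail pages end in .html
        Rule(LinkExtractor(allow=(r'\.html$', )), callback='parse_job'),
    )
    # parse_company and parse_job would be the same methods as in the spider above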
5) Finally, the helper class file mssql.py, which sits in the zhaopin directory at the same level as the spiders directory:
# -*- coding:utf-8 -*-
import pymssql

class MsSql:
    def __init__(self, host, user, pwd, db):
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def __GetConnect(self):
        if not self.db:
            raise NameError("no database name given")
        self.conn = pymssql.connect(host=self.host, user=self.user, password=self.pwd, database=self.db, charset="utf8")
        cur = self.conn.cursor()
        if not cur:
            raise NameError("connection failed: check account or password")
        else:
            return cur

    def ExecQuery(self, sql):
        cur = self.__GetConnect()
        cur.execute(sql)
        resList = cur.fetchall()
        self.conn.close()
        return resList

    def ExecNonQuery(self, sql):
        cur = self.__GetConnect()
        cur.execute(sql)
        self.conn.commit()
        self.conn.close()
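A quick usage example of the helper class (a sketch; the table name and credentials are the same placeholders used earlier in this post):
# Sketch: query back rows that the pipeline inserted.
ms = MsSql(host="192.168.0.28", user="sa", pwd="sa", db="TalCrawl")
rows = ms.ExecQuery("select domain, url, comname from cra_craw_comdata")
for domain, url, comname in rows:
    print(domain, url, comname)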
III. Run the spider
At a cmd prompt, change into the zhaopin directory and run:
scrapy crawl wuyaojob
How to run several spiders at once, and how to launch a crawl from Python code, is left for later study; a minimal sketch follows for reference.
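The usual entry point for running spiders from Python code is Scrapy's CrawlerProcess. The snippet below is a sketch, not part of the original project; the file name run.py is an assumption, and it should be run from the project root so get_project_settings() can find settings.py.
# run.py - sketch for launching the spider(s) from Python code
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('wuyaojob')          # spider name as defined in wuyaojob_spider.py
# process.crawl('another_spider')  # more spiders can be scheduled the same way
process.start()                    # blocks until all scheduled crawls finish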