Crawler project: 18,804 Python job listings in Beijing, scraped from Lagou

Screenshot of the final crawl results

A total of 18,804 records were scraped.
[screenshot of the scraped data]

The database class

import os
import json
import queue
import random
import sqlite3

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote


class Crawldb:
    def __init__(self):
        self.db_file = 'lago_python.db'
        self.table = 'lago'

        self.create_db_file()
        self.conn = sqlite3.connect(self.db_file)
        self.create_table()

Create the database file

    def create_db_file(self):
        # Create an empty file for SQLite if it does not exist yet
        if not os.path.exists(self.db_file):
            f = open(self.db_file, 'w+')
            f.close()
        return True


Create the table

    def create_table(self):
        cursor = self.conn.cursor()
        # Drop any existing table so every run starts from a clean slate
        sql = "DROP TABLE IF EXISTS '{table}'".format(table=self.table)
        cursor.execute(sql)

        sql = "create table %s (id integer primary key not null, " \
              "district varchar(25) not null, biz_area varchar(25) not null, createTime varchar(25) not null, " \
              "companyShortName varchar(25) not null, companySize varchar(25) not null, industryField varchar(25) not null, " \
              "positionName varchar(25) not null, firstType varchar(25) not null, secondType varchar(25) not null, " \
              "salary varchar(25) not null, workYear varchar(25) not null, education varchar(25) not null, positionId varchar(25) not null)" % self.table
        cursor.execute(sql)
        cursor.close()
        self.conn.commit()

        return True

Insert data

    def insert_data(self, args):
        sql = "insert into `%s` (`district`, `biz_area`, `createTime`, " \
              "`companyShortName`, `companySize`, `industryField`, " \
              "`positionName`, `firstType`, `secondType`, " \
              "`salary`, `workYear`, `education`, `positionId`) values " % self.table
        sql += "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % args

        # Use one cursor for the statement, then commit the transaction
        cursor = self.conn.cursor()
        row = cursor.execute(sql).rowcount
        cursor.close()
        self.conn.commit()

        return row
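
Building the SQL string with % interpolation breaks as soon as a field contains a single quote, which company names and position titles sometimes do. A safer variant, shown here only as a sketch (the method name insert_data_safe is hypothetical, not part of the original code), lets sqlite3 bind the values itself with ? placeholders:

    # Hypothetical variant of insert_data using bound parameters instead of
    # string interpolation, so quotes inside field values cannot break the SQL.
    def insert_data_safe(self, args):
        sql = ("insert into `%s` (`district`, `biz_area`, `createTime`, "
               "`companyShortName`, `companySize`, `industryField`, "
               "`positionName`, `firstType`, `secondType`, "
               "`salary`, `workYear`, `education`, `positionId`) "
               "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") % self.table
        cursor = self.conn.cursor()
        row = cursor.execute(sql, args).rowcount
        cursor.close()
        self.conn.commit()
        return row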

Crawl Lagou

class Crawl:
    def __init__(self):
        self.city = '北京'
        self.db = Crawldb()
        self.district = dict()

        # Default request headers; User-Agent and Referer are reset in crawl_positions
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        }
        # Pool of User-Agent strings to rotate through
        self.browsers = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) ",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36",
        ]

Crawl the administrative districts

    def get_administrative(self):
        url = "https://www.lagou.com/jobs/list_python?city={city}&cl=false&fromSearch=true&labelWords=&suginput=".format(city=self.city)
        html = requests.get(url, headers=self.headers).content.decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        res = soup.find_all('div', class_='content')

        # Every link except "不限" (no limit) is an administrative district
        for tag in res[0].find_all('a'):
            if tag.string != "不限":
                self.district.setdefault(tag.string, [])

Crawl the business areas

    def get_business(self):
        for i in self.district.keys():
            url = 'https://www.lagou.com/jobs/list_python?px=default&city={city}&district={district}#filterBox'.format(city=self.city, district=i)
            html = requests.get(url, headers=self.headers).content.decode('utf-8')
            soup = BeautifulSoup(html, 'lxml')
            res = soup.find_all('li', class_='detail-bizArea-area')

            # Every link except "不限" (no limit) is a business area in this district
            for tag in res[0].find_all('a'):
                if tag.string != "不限":
                    self.district[i].append(tag.string)
            print(self.district[i])

Crawl job listings for each business area

    def crawl_positions(self, district, biz_area):
        # The Ajax endpoint returns paginated JSON; the Referer must match the search page
        url = "https://www.lagou.com/jobs/positionAjax.json?px=new&city=" + self.city + "&district=" + district + "&bizArea=" + biz_area + "&needAddtionalResult=false"
        referer_url = "https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?px=new&city=" + quote(self.city) + "&district=" + quote(district) + "&bizArea=" + quote(biz_area)
        self.headers['User-Agent'] = self.browsers[random.randint(0, 5)]
        self.headers['Referer'] = referer_url

        page = 1
        flag = 'true'
        result = queue.Queue()

        while True:
            if page != 1:
                flag = 'false'
            data = {
                'first': flag,
                'pn': page,
                'kd': 'python'
            }
            res = self.analysis_data(url, data)
            if not res['content']['positionResult']['result']:
                break

            for i in res['content']['positionResult']['result']:
                data = district, biz_area, i['createTime'], \
                       i['companyShortName'], i['companySize'], i['industryField'], \
                       i['positionName'], i['firstType'], i['secondType'], \
                       i['salary'], i['workYear'], i['education'], i['positionId']
                result.put(data)
            page = page + 1

        return result
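
For orientation, the pagination loop above only touches a handful of keys in the Ajax response; the outline below is inferred from the code itself, not from any official Lagou documentation:

    # Inferred shape of the positionAjax.json response (only the keys used above):
    #   res['content']['positionResult']['result'] -> list of position dicts, each
    #   containing 'createTime', 'companyShortName', 'companySize', 'industryField',
    #   'positionName', 'firstType', 'secondType', 'salary', 'workYear',
    #   'education' and 'positionId'.
    # An empty 'result' list is what ends the while loop.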


Fetch data from the URL through a proxy IP

    def get_data(self, url, data):
        try:
            # Ask the local proxy pool for a random proxy, then POST through it
            response = requests.get('http://localhost:5555/random')
            proxy = {'http': 'http://' + response.text}
            res = requests.post(url, headers=self.headers, proxies=proxy, data=data, timeout=5)
        except requests.RequestException:
            print('Proxy failed, retrying the request')
            res = self.get_data(url, data)

        return res
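
get_data assumes a separate proxy-pool service is already running on localhost:5555 and that its /random endpoint returns a bare "ip:port" string. A small optional helper (hypothetical, not part of the original code) can confirm that before starting a long crawl:

    # Hypothetical sanity check: verify the assumed proxy pool on localhost:5555
    # answers before kicking off the crawl; /random is expected to return "ip:port".
    def check_proxy_pool(self):
        try:
            response = requests.get('http://localhost:5555/random', timeout=3)
            print('Proxy pool returned:', response.text)
            return True
        except requests.RequestException:
            print('Proxy pool is not reachable on localhost:5555')
            return False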

Check the parsed data for a KeyError

    def analysis_data(self, url, data):
        res = self.get_data(url, data)
        res = json.loads(res.content.decode('utf-8'))

        # Retry when the response lacks the expected keys
        try:
            if not res['content']['positionResult']['result']:
                pass
        except KeyError:
            print('KeyError, retrying the request')
            res = self.analysis_data(url, data)

        return res

Main routine for crawling job listings

    def crawl(self):
        for i in self.district.keys():
            for biz_area in self.district[i]:
                res_queue = self.crawl_positions(i, biz_area)
                print('Finished crawling positions for %s %s: %d' % (i, biz_area, res_queue.qsize()))

                while not res_queue.empty():
                    data = res_queue.get()
                    row = self.db.insert_data(data)
                    if not row:
                        print('Failed to insert record')

Run the crawler

    def run(self):
        self.get_administrative()
        self.get_business()
        self.crawl()
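
To tie everything together, a minimal entry point (not shown in the original post) just instantiates the crawler and calls run():

if __name__ == '__main__':
    crawler = Crawl()
    crawler.run()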