Crawler project: 18,804 Python job listings in Beijing, scraped from Lagou

Screenshot of the final crawl results

A total of 18,804 records were scraped.
[screenshot of the scraped data]

The database class

import os
import json
import queue
import random
import sqlite3

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote


class Crawldb:
    def __init__(self):
        self.db_file = 'lago_python.db'
        self.table = 'lago'

        self.create_db_file()
        self.conn = sqlite3.connect(self.db_file)
        self.create_table()

Create the database file

    def create_db_file(self):
        # Create an empty file for SQLite if it does not exist yet
        if not os.path.exists(self.db_file):
            f = open(self.db_file, 'w+')
            f.close()
        return True


Create the table

    def create_table(self):
        cursor = self.conn.cursor()
        # Drop any existing table so every run starts from a clean slate
        sql = "DROP TABLE IF EXISTS '{table}'".format(table=self.table)
        cursor.execute(sql)

        sql = "create table %s (id integer primary key not null, " \
              "district varchar(25) not null, biz_area varchar(25) not null, createTime varchar(25) not null, " \
              "companyShortName varchar(25) not null, companySize varchar(25) not null, industryField varchar(25) not null, " \
              "positionName varchar(25) not null, firstType varchar(25) not null, secondType varchar(25) not null, " \
              "salary varchar(25) not null, workYear varchar(25) not null, education varchar(25) not null, positionId varchar(25) not null)" % self.table
        cursor.execute(sql)
        cursor.close()
        self.conn.commit()

        return True

Insert data

    def insert_data(self, args):
        sql = "insert into `%s` (`district`, `biz_area`, `createTime`, " \
              "`companyShortName`, `companySize`, `industryField`, " \
              "`positionName`, `firstType`, `secondType`, " \
              "`salary`, `workYear`, `education`, `positionId`) values " % self.table
        sql += "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % args

        # Use one cursor for the statement, then commit the transaction
        cursor = self.conn.cursor()
        row = cursor.execute(sql).rowcount
        cursor.close()
        self.conn.commit()

        return row
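
Building the SQL string with % interpolation breaks as soon as a field contains a single quote, which company names and position titles sometimes do. A safer variant, shown here only as a sketch (the method name insert_data_safe is hypothetical, not part of the original code), lets sqlite3 bind the values itself with ? placeholders:

    # Hypothetical variant of insert_data using bound parameters instead of
    # string interpolation, so quotes inside field values cannot break the SQL.
    def insert_data_safe(self, args):
        sql = ("insert into `%s` (`district`, `biz_area`, `createTime`, "
               "`companyShortName`, `companySize`, `industryField`, "
               "`positionName`, `firstType`, `secondType`, "
               "`salary`, `workYear`, `education`, `positionId`) "
               "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") % self.table
        cursor = self.conn.cursor()
        row = cursor.execute(sql, args).rowcount
        cursor.close()
        self.conn.commit()
        return row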

Crawl Lagou

class Crawl:
    def __init__(self):
        self.city = '北京'
        self.db = Crawldb()
        self.district = dict()

        # Default request headers; User-Agent and Referer are reset in crawl_positions
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        }
        # Pool of User-Agent strings to rotate through
        self.browsers = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) ",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36",
        ]

Crawl the administrative districts

    def get_administrative(self):
        url = "https://www.lagou.com/jobs/list_python?city={city}&cl=false&fromSearch=true&labelWords=&suginput=".format(city=self.city)
        html = requests.get(url, headers=self.headers).content.decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        res = soup.find_all('div', class_='content')

        # Every link except "不限" (no limit) is an administrative district
        for tag in res[0].find_all('a'):
            if tag.string != "不限":
                self.district.setdefault(tag.string, [])

Crawl the business areas

    def get_business(self):
        for i in self.district.keys():
            url = 'https://www.lagou.com/jobs/list_python?px=default&city={city}&district={district}#filterBox'.format(city=self.city, district=i)
            html = requests.get(url, headers=self.headers).content.decode('utf-8')
            soup = BeautifulSoup(html, 'lxml')
            res = soup.find_all('li', class_='detail-bizArea-area')

            # Every link except "不限" (no limit) is a business area in this district
            for tag in res[0].find_all('a'):
                if tag.string != "不限":
                    self.district[i].append(tag.string)
            print(self.district[i])

Crawl job listings for each business area

    def crawl_positions(self, district, biz_area):
        # The Ajax endpoint returns paginated JSON; the Referer must match the search page
        url = "https://www.lagou.com/jobs/positionAjax.json?px=new&city=" + self.city + "&district=" + district + "&bizArea=" + biz_area + "&needAddtionalResult=false"
        referer_url = "https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?px=new&city=" + quote(self.city) + "&district=" + quote(district) + "&bizArea=" + quote(biz_area)
        self.headers['User-Agent'] = self.browsers[random.randint(0, 5)]
        self.headers['Referer'] = referer_url

        page = 1
        flag = 'true'
        result = queue.Queue()

        while True:
            if page != 1:
                flag = 'false'
            data = {
                'first': flag,
                'pn': page,
                'kd': 'python'
            }
            res = self.analysis_data(url, data)
            if not res['content']['positionResult']['result']:
                break

            for i in res['content']['positionResult']['result']:
                data = district, biz_area, i['createTime'], \
                       i['companyShortName'], i['companySize'], i['industryField'], \
                       i['positionName'], i['firstType'], i['secondType'], \
                       i['salary'], i['workYear'], i['education'], i['positionId']
                result.put(data)
            page = page + 1

        return result
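
For orientation, the pagination loop above only touches a handful of keys in the Ajax response; the outline below is inferred from the code itself, not from any official Lagou documentation:

    # Inferred shape of the positionAjax.json response (only the keys used above):
    #   res['content']['positionResult']['result'] -> list of position dicts, each
    #   containing 'createTime', 'companyShortName', 'companySize', 'industryField',
    #   'positionName', 'firstType', 'secondType', 'salary', 'workYear',
    #   'education' and 'positionId'.
    # An empty 'result' list is what ends the while loop.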


Fetch data from the URL through a proxy IP

    def get_data(self, url, data):
        try:
            # Ask the local proxy pool for a random proxy, then POST through it
            response = requests.get('http://localhost:5555/random')
            proxy = {'http': 'http://' + response.text}
            res = requests.post(url, headers=self.headers, proxies=proxy, data=data, timeout=5)
        except requests.RequestException:
            print('Proxy failed, retrying the request')
            res = self.get_data(url, data)

        return res
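
get_data assumes a separate proxy-pool service is already running on localhost:5555 and that its /random endpoint returns a bare "ip:port" string. A small optional helper (hypothetical, not part of the original code) can confirm that before starting a long crawl:

    # Hypothetical sanity check: verify the assumed proxy pool on localhost:5555
    # answers before kicking off the crawl; /random is expected to return "ip:port".
    def check_proxy_pool(self):
        try:
            response = requests.get('http://localhost:5555/random', timeout=3)
            print('Proxy pool returned:', response.text)
            return True
        except requests.RequestException:
            print('Proxy pool is not reachable on localhost:5555')
            return False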

Check the parsed data for a KeyError

    def analysis_data(self, url, data):
        res = self.get_data(url, data)
        res = json.loads(res.content.decode('utf-8'))

        # Retry when the response lacks the expected keys
        try:
            if not res['content']['positionResult']['result']:
                pass
        except KeyError:
            print('KeyError, retrying the request')
            res = self.analysis_data(url, data)

        return res

Main routine for crawling job listings

    def crawl(self):
        for i in self.district.keys():
            for biz_area in self.district[i]:
                res_queue = self.crawl_positions(i, biz_area)
                print('Finished crawling positions for %s %s: %d' % (i, biz_area, res_queue.qsize()))

                while not res_queue.empty():
                    data = res_queue.get()
                    row = self.db.insert_data(data)
                    if not row:
                        print('Failed to insert record')

Run the crawler

    def run(self):
        self.get_administrative()
        self.get_business()
        self.crawl()
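
To tie everything together, a minimal entry point (not shown in the original post) just instantiates the crawler and calls run():

if __name__ == '__main__':
    crawler = Crawl()
    crawler.run()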