将前文数据放至MySQL数据库中
class Data_Transport(object):
    """Persist cleaned 51job records into the MySQL ``qianc_job`` database."""

    def __init__(self):
        """Open a connection to the local MySQL ``qianc_job`` database.

        charset='utf8' is required: without it pymysql defaults to latin-1
        and inserting Chinese text raises
        "'latin-1' codec can't encode characters ...".
        """
        # Keyword arguments: positional connect() arguments are deprecated
        # in recent pymysql releases.  Fill in your own host/user/password/db.
        self.tbd = pymysql.connect(host='localhost', user='root',
                                   password='wjh940517', database='qianc_job',
                                   charset='utf8')
        self.t_cousor = self.tbd.cursor()

    def create_table(self):
        """Create the ``51job`` table (company name, job info, salary).

        ``IF NOT EXISTS`` makes the call idempotent, so re-running the
        script does not fail with "table already exists".  The table name
        starts with a digit, so it is backtick-quoted.
        """
        sql = ("create table if not exists `51job`("
               "id int auto_increment primary key,"
               "sname varchar(200) not null,"
               "job_datas varchar(2000) not null,"
               "job_salary varchar(100) not null)")
        self.t_cousor.execute(sql)
        self.tbd.commit()

    def data_save(self, sname, job_data, job_salary):
        """Insert one cleaned job record.

        Uses a parameterized query: the %s placeholders are filled in by
        the driver itself, which escapes the values correctly.  This
        removes the need for pymysql.escape_string() and eliminates the
        SQL-injection / broken-quoting errors the string-formatted
        version produced (ProgrammingError 1064).
        """
        sql = ("INSERT INTO `51job` (sname, job_datas, job_salary) "
               "VALUES (%s, %s, %s)")
        self.t_cousor.execute(sql, (sname, job_data, job_salary))
        self.tbd.commit()

    def tear_down(self):
        """Close the cursor, then the database connection."""
        self.t_cousor.close()
        self.tbd.close()
值得一说的是在data_save()方法中:
sql = """INSERT INTO 51job ( sname, job_datas,job_salary) VALUES("%s", "%s", "%s")""" %(sname,pymysql.escape_string(job_data),job_salary)
这与我们在Python中的语法有点不一样,%s还加了双引号。
另外,将 pymysql.escape_string(job_data) 改为 job_data 会引发报错:pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax")。更稳妥的做法是使用参数化查询,把值通过 execute(sql, params) 传入,由驱动自动转义。
Python爬取51jobs整段代码:
from selenium import webdriver
import time
import requests
import re
from bs4 import BeautifulSoup
import pymysql
class FindJobs(object):
    """Drive a Chrome browser through 51job search filters and collect
    the result-page URLs, then scrape/clean each job posting."""

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        # Implicit wait so element lookups retry for up to 2s while the
        # page is still rendering.
        self.driver.implicitly_wait(2)
        self.url = 'https://mkt.51job.com/tg/sem/pz_2018.html?from=baidupz'
        self.driver.get(self.url)
        # Accumulates the job-detail URLs found by get_webpage().
        self.webpages_list = []

    def find_position(self):
        """Fill in the job keyword, pick the city, and submit the search."""
        self.driver.find_element_by_xpath("//*[@id='kwdselectid']").send_keys("软件测试工程师")  # 职位
        # Open the city picker.
        self.driver.find_element_by_xpath("//*[@id='work_position_input']").click()
        # The city widget pre-selects a default region; this JS deselects
        # it and highlights the target region so the click below lands on
        # a visible, active element.
        js = "var x = document.getElementById('work_position_click_center_left_each_000000');" \
             "x.style.color='red';" \
             "x.className='';" \
             "y = document.getElementById('work_position_click_center_left_each_220200');" \
             "y.className='on'"
        self.driver.execute_script(js)
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//*[@id='work_position_click_center_left_each_220200']").click()
        self.driver.find_element_by_xpath("//*[@id='work_position_click_center_right_list_category_220200_080200']").click()
        self.driver.find_element_by_xpath("//*[@id='work_position_click_bottom_save']").click()
        # Submit the search.
        self.driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div/div/button").click()

    def find_position_2(self):
        """Apply the salary / experience / education filters on the result page."""
        self.driver.find_element_by_xpath("/html/body/div[2]/div[1]/div[16]/span").click()
        self.driver.find_element_by_xpath("//*[@id='filter_providesalary']/ul/li[7]").click()  # salary range
        self.driver.find_element_by_xpath("//*[@id='filter_workyear']/ul/li[3]/a").click()  # years of experience
        self.driver.find_element_by_xpath("//*[@id='filter_degreefrom']/ul/li[5]/a").click()  # education

    def get_webpage(self):
        """Collect every job-detail URL on the result page, then quit the browser."""
        webpages = self.driver.find_elements_by_xpath("//*[@id='resultList']/div/p/span/a")
        for webpage in webpages:
            self.webpages_list.append(webpage.get_attribute("href"))
        print(self.webpages_list)
        self.driver.close()

    @staticmethod
    def data_cleaning(url):
        """Fetch one job-posting page and extract (company, description, salary).

        Raises IndexError when the page does not follow the standard 51job
        template (e.g. the posting redirects to a company's own site) —
        the caller skips those URLs.
        """
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
        }
        # BUG FIX: requests.get(url, headers) passed the dict as the
        # `params` positional argument, so the User-Agent header was
        # never actually sent.  It must be a keyword argument.
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser', exclude_encodings="utf-8")
        # Company name.
        sname = soup.find_all(class_='catn')[0]['title']
        # Job description lives in the first bmsg/job_msg/inbox div;
        # flatten it to one line, then cut everything after the nested
        # <div> and turn <p> boundaries into newlines.
        directory = soup.find_all(class_='bmsg job_msg inbox')[0]
        job_datas = str(directory).replace("\n", "")
        pattern = re.compile('<div class="bmsg job_msg inbox">(.*?)<div', re.S)
        job_data = re.findall(pattern, job_datas)
        job_data = job_data[0].replace('<p>', '').replace('</p>', '\n')
        # Monthly salary.
        job_salary = soup.find_all(class_='cn')[0].strong.text
        return sname, job_data, job_salary
class Data_Transport(object):
    """Persist cleaned 51job records into the MySQL ``qianc_job`` database."""

    def __init__(self):
        # charset='utf8' is required, otherwise pymysql defaults to
        # latin-1 and Chinese text fails with
        # "'latin-1' codec can't encode characters ...".
        # Keyword arguments: positional connect() args are deprecated.
        self.tbd = pymysql.connect(host='localhost', user='root',
                                   password='wjh940517', database='qianc_job',
                                   charset='utf8')
        self.t_cousor = self.tbd.cursor()

    def create_table(self):
        # IF NOT EXISTS keeps re-runs from failing; the table name starts
        # with a digit, so it is backtick-quoted.
        sql = ("create table if not exists `51job`("
               "id int auto_increment primary key,"
               "sname varchar(200) not null,"
               "job_datas varchar(2000) not null,"
               "job_salary varchar(100) not null)")
        self.t_cousor.execute(sql)
        self.tbd.commit()

    def data_save(self, sname, job_data, job_salary):
        # Parameterized query: the driver escapes the values itself,
        # which replaces pymysql.escape_string() and avoids the
        # ProgrammingError 1064 the string-formatted SQL produced.
        sql = ("INSERT INTO `51job` (sname, job_datas, job_salary) "
               "VALUES (%s, %s, %s)")
        self.t_cousor.execute(sql, (sname, job_data, job_salary))
        self.tbd.commit()

    def tear_down(self):
        # Close the cursor, then the connection.
        self.t_cousor.close()
        self.tbd.close()
if __name__ == "__main__":
    find_jobs = FindJobs()
    find_jobs.find_position()
    find_jobs.find_position_2()
    find_jobs.get_webpage()
    data_transport = Data_Transport()
    try:
        data_transport.create_table()
        for url in find_jobs.webpages_list:
            try:
                sname, job_data, job_salary = find_jobs.data_cleaning(url)
                data_transport.data_save(sname, job_data, job_salary)
            except IndexError:
                # Pages that redirect to a company's own site don't match
                # the 51job template; log and skip instead of failing.
                print("skipped non-standard page:", url)
    finally:
        # Always release the DB connection, even if scraping/cleaning
        # raises an unexpected error mid-loop.
        data_transport.tear_down()
对于最后这一段的解释:
for url in find_jobs.webpages_list: try: sname, job_data, job_salary = find_jobs.data_cleaning(url) data_transport.data_save(sname, job_data, job_salary) except IndexError: pass
因为,并不是每个公司招聘都用前程无忧的网站模板,有些公司招聘会跳转到其公司官网,导致数据清洗报错。
所以,嗯,你懂的,我跳过去了
Python爬取51jobs整个就结束了。对于读取数据库内容,你也可以试着用django搭建网站查看。
什么?不会搭建django?点击这里参考django学习。
当然,上面代码还是很多优化的地方,欢迎大家指正。