将前文数据放至MySQL数据库中
class Data_Transport(object):
    """Persist cleaned 51job records into the MySQL ``qianc_job`` database."""

    def __init__(self):
        """Open a connection to the local MySQL ``qianc_job`` database.

        charset='utf8' is required: without it pymysql defaults to latin-1
        and inserting Chinese text raises
        "'latin-1' codec can't encode characters ...".
        """
        # Keyword arguments: positional connect() arguments are deprecated
        # in recent pymysql releases.  Fill in your own host/user/password/db.
        self.tbd = pymysql.connect(host='localhost', user='root',
                                   password='wjh940517', database='qianc_job',
                                   charset='utf8')
        self.t_cousor = self.tbd.cursor()

    def create_table(self):
        """Create the ``51job`` table (company name, job info, salary).

        ``IF NOT EXISTS`` makes the call idempotent, so re-running the
        script does not fail with "table already exists".  The table name
        starts with a digit, so it is backtick-quoted.
        """
        sql = ("create table if not exists `51job`("
               "id int auto_increment primary key,"
               "sname varchar(200) not null,"
               "job_datas varchar(2000) not null,"
               "job_salary varchar(100) not null)")
        self.t_cousor.execute(sql)
        self.tbd.commit()

    def data_save(self, sname, job_data, job_salary):
        """Insert one cleaned job record.

        Uses a parameterized query: the %s placeholders are filled in by
        the driver itself, which escapes the values correctly.  This
        removes the need for pymysql.escape_string() and eliminates the
        SQL-injection / broken-quoting errors the string-formatted
        version produced (ProgrammingError 1064).
        """
        sql = ("INSERT INTO `51job` (sname, job_datas, job_salary) "
               "VALUES (%s, %s, %s)")
        self.t_cousor.execute(sql, (sname, job_data, job_salary))
        self.tbd.commit()

    def tear_down(self):
        """Close the cursor, then the database connection."""
        self.t_cousor.close()
        self.tbd.close()
值得一说的是在data_save()方法中:
sql = """INSERT INTO 51job ( sname, job_datas,job_salary) VALUES("%s", "%s", "%s")""" %(sname,pymysql.escape_string(job_data),job_salary)
这与我们在Python中的语法有点不一样,%s还加了双引号。
另外,将 pymysql.escape_string(job_data) 改为 job_data 会引发报错:pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax")。更稳妥的做法是使用参数化查询,把值通过 execute(sql, params) 传入,由驱动自动转义。
Python爬取51jobs整段代码:
from selenium import webdriver
import time
import requests
import re
from bs4 import BeautifulSoup
import pymysql
class FindJobs(object):
    """Drive a Chrome browser through 51job search filters and collect
    the result-page URLs, then scrape/clean each job posting."""

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        # Implicit wait so element lookups retry for up to 2s while the
        # page is still rendering.
        self.driver.implicitly_wait(2)
        self.url = 'https://mkt.51job.com/tg/sem/pz_2018.html?from=baidupz'
        self.driver.get(self.url)
        # Accumulates the job-detail URLs found by get_webpage().
        self.webpages_list = []

    def find_position(self):
        """Fill in the job keyword, pick the city, and submit the search."""
        self.driver.find_element_by_xpath("//*[@id='kwdselectid']").send_keys("软件测试工程师")  # 职位
        # Open the city picker.
        self.driver.find_element_by_xpath("//*[@id='work_position_input']").click()
        # The city widget pre-selects a default region; this JS deselects
        # it and highlights the target region so the click below lands on
        # a visible, active element.
        js = "var x = document.getElementById('work_position_click_center_left_each_000000');" \
             "x.style.color='red';" \
             "x.className='';" \
             "y = document.getElementById('work_position_click_center_left_each_220200');" \
             "y.className='on'"
        self.driver.execute_script(js)
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//*[@id='work_position_click_center_left_each_220200']").click()
        self.driver.find_element_by_xpath("//*[@id='work_position_click_center_right_list_category_220200_080200']").click()
        self.driver.find_element_by_xpath("//*[@id='work_position_click_bottom_save']").click()
        # Submit the search.
        self.driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div/div/button").click()

    def find_position_2(self):
        """Apply the salary / experience / education filters on the result page."""
        self.driver.find_element_by_xpath("/html/body/div[2]/div[1]/div[16]/span").click()
        self.driver.find_element_by_xpath("//*[@id='filter_providesalary']/ul/li[7]").click()  # salary range
        self.driver.find_element_by_xpath("//*[@id='filter_workyear']/ul/li[3]/a").click()  # years of experience
        self.driver.find_element_by_xpath("//*[@id='filter_degreefrom']/ul/li[5]/a").click()  # education

    def get_webpage(self):
        """Collect every job-detail URL on the result page, then quit the browser."""
        webpages = self.driver.find_elements_by_xpath("//*[@id='resultList']/div/p/span/a")
        for webpage in webpages:
            self.webpages_list.append(webpage.get_attribute("href"))
        print(self.webpages_list)
        self.driver.close()

    @staticmethod
    def data_cleaning(url):
        """Fetch one job-posting page and extract (company, description, salary).

        Raises IndexError when the page does not follow the standard 51job
        template (e.g. the posting redirects to a company's own site) —
        the caller skips those URLs.
        """
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
        }
        # BUG FIX: requests.get(url, headers) passed the dict as the
        # `params` positional argument, so the User-Agent header was
        # never actually sent.  It must be a keyword argument.
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser', exclude_encodings="utf-8")
        # Company name.
        sname = soup.find_all(class_='catn')[0]['title']
        # Job description lives in the first bmsg/job_msg/inbox div;
        # flatten it to one line, then cut everything after the nested
        # <div> and turn <p> boundaries into newlines.
        directory = soup.find_all(class_='bmsg job_msg inbox')[0]
        job_datas = str(directory).replace("\n", "")
        pattern = re.compile('<div class="bmsg job_msg inbox">(.*?)<div', re.S)
        job_data = re.findall(pattern, job_datas)
        job_data = job_data[0].replace('<p>', '').replace('</p>', '\n')
        # Monthly salary.
        job_salary = soup.find_all(class_='cn')[0].strong.text
        return sname, job_data, job_salary
class Data_Transport(object):
    """Persist cleaned 51job records into the MySQL ``qianc_job`` database."""

    def __init__(self):
        # charset='utf8' is required, otherwise pymysql defaults to
        # latin-1 and Chinese text fails with
        # "'latin-1' codec can't encode characters ...".
        # Keyword arguments: positional connect() args are deprecated.
        self.tbd = pymysql.connect(host='localhost', user='root',
                                   password='wjh940517', database='qianc_job',
                                   charset='utf8')
        self.t_cousor = self.tbd.cursor()

    def create_table(self):
        # IF NOT EXISTS keeps re-runs from failing; the table name starts
        # with a digit, so it is backtick-quoted.
        sql = ("create table if not exists `51job`("
               "id int auto_increment primary key,"
               "sname varchar(200) not null,"
               "job_datas varchar(2000) not null,"
               "job_salary varchar(100) not null)")
        self.t_cousor.execute(sql)
        self.tbd.commit()

    def data_save(self, sname, job_data, job_salary):
        # Parameterized query: the driver escapes the values itself,
        # which replaces pymysql.escape_string() and avoids the
        # ProgrammingError 1064 the string-formatted SQL produced.
        sql = ("INSERT INTO `51job` (sname, job_datas, job_salary) "
               "VALUES (%s, %s, %s)")
        self.t_cousor.execute(sql, (sname, job_data, job_salary))
        self.tbd.commit()

    def tear_down(self):
        # Close the cursor, then the connection.
        self.t_cousor.close()
        self.tbd.close()
if __name__ == "__main__":
    find_jobs = FindJobs()
    find_jobs.find_position()
    find_jobs.find_position_2()
    find_jobs.get_webpage()
    data_transport = Data_Transport()
    try:
        data_transport.create_table()
        for url in find_jobs.webpages_list:
            try:
                sname, job_data, job_salary = find_jobs.data_cleaning(url)
                data_transport.data_save(sname, job_data, job_salary)
            except IndexError:
                # Pages that redirect to a company's own site don't match
                # the 51job template; log and skip instead of failing.
                print("skipped non-standard page:", url)
    finally:
        # Always release the DB connection, even if scraping/cleaning
        # raises an unexpected error mid-loop.
        data_transport.tear_down()
对于最后这一段的解释:
for url in find_jobs.webpages_list: try: sname, job_data, job_salary = find_jobs.data_cleaning(url) data_transport.data_save(sname, job_data, job_salary) except IndexError: pass
因为,并不是每个公司招聘都用前程无忧的网站模板,有些公司招聘会跳转到其公司官网,导致数据清洗报错。
所以,嗯,你懂的,我跳过去了
Python爬取51jobs整个就结束了。对于读取数据库内容,你也可以试着用django搭建网站查看。
什么?不会搭建django?点击这里参考django学习。
当然,上面代码还是很多优化的地方,欢迎大家指正。