Scraping 51jobs with Python, Part 4: Wrapping Up with MySQL

In this part we store the data gathered in the previous posts in a MySQL database.

class Data_Transport(object):
    def __init__(self):
        """
        Connect to the qianc_job database in MySQL.
        """
        # Without charset='utf8' this raises: 'latin-1' codec can't encode characters in position 61-72: ordinal not in range(256)
        self.tbd = pymysql.connect(host='localhost', user='root', password='wjh940517',
                                   database='qianc_job', charset='utf8')  # fill in your own host, user, password and database name
        self.t_cursor = self.tbd.cursor()

    def create_table(self):
        """
        Create the 51job table in MySQL.
        """
        sql = "create table 51job(id int auto_increment primary key," \
              "sname varchar(200) not null," \
              "job_datas varchar(2000) not null," \
              "job_salary varchar(100) not null)"
        self.t_cursor.execute(sql)
        self.tbd.commit()

    def data_save(self, sname, job_data, job_salary):
        """
        Save the cleaned data into the 51job table.
        """
        sql = """INSERT INTO 51job (sname, job_datas, job_salary) VALUES("%s", "%s", "%s")""" % (sname, pymysql.escape_string(job_data), job_salary)
        self.t_cursor.execute(sql)
        self.tbd.commit()

    def tear_down(self):
        """
        Close the MySQL connection.
        """
        self.t_cursor.close()
        self.tbd.close()

One detail worth pointing out in the data_save() method:

sql = """INSERT INTO 51job ( sname, job_datas,job_salary) VALUES("%s", "%s", "%s")""" %(sname,pymysql.escape_string(job_data),job_salary)

This looks slightly different from ordinary Python string formatting: each %s is wrapped in double quotes, so that the interpolated value ends up as a quoted string literal inside the SQL statement.

Also, replacing pymysql.escape_string(job_data) with plain job_data raises pymysql.err.ProgrammingError: (1064, 'You have an error in your SQL syntax ...'), because the scraped job description contains quotes and other characters that break the hand-assembled SQL string.
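
For reference, a safer pattern is to let PyMySQL bind the parameters itself instead of assembling the SQL string by hand. Below is a minimal sketch of an alternative data_save(): here %s is a placeholder consumed by execute() rather than Python string formatting, so the driver handles all quoting and escaping and neither the double quotes nor escape_string() is needed:

def data_save(self, sname, job_data, job_salary):
    # execute() quotes and escapes each bound value itself,
    # so a messy job description cannot break the statement.
    sql = "INSERT INTO 51job (sname, job_datas, job_salary) VALUES (%s, %s, %s)"
    self.t_cursor.execute(sql, (sname, job_data, job_salary))
    self.tbd.commit()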

The complete 51jobs scraper:

from selenium import webdriver
import time
import requests
import re
from bs4 import BeautifulSoup
import pymysql


class FindJobs(object):
    
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        self.driver.implicitly_wait(2)
        self.url = 'https://mkt.51job.com/tg/sem/pz_2018.html?from=baidupz'
        self.driver.get(self.url)
        self.webpages_list = []
 
    def find_position(self):
        self.driver.find_element_by_xpath("//*[@id='kwdselectid']").send_keys("软件测试工程师")  # 职位
        # 选择城市
        self.driver.find_element_by_xpath("//*[@id='work_position_input']").click()
        js = "var x = document.getElementById('work_position_click_center_left_each_000000');" \
             "x.style.color='red';" \
             "x.className='';" \
             "y = document.getElementById('work_position_click_center_left_each_220200');" \
             "y.className='on'"
        self.driver.execute_script(js)
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//*[@id='work_position_click_center_left_each_220200']").click()
        self.driver.find_element_by_xpath("//*[@id='work_position_click_center_right_list_category_220200_080200']").click()
        self.driver.find_element_by_xpath("//*[@id='work_position_click_bottom_save']").click()
        # click the search button
        self.driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div/div/button").click()
    
    def find_position_2(self):
        self.driver.find_element_by_xpath("/html/body/div[2]/div[1]/div[16]/span").click()
        self.driver.find_element_by_xpath("//*[@id='filter_providesalary']/ul/li[7]").click()   # 月薪范围
        self.driver.find_element_by_xpath("//*[@id='filter_workyear']/ul/li[3]/a").click()      # 工作年限
        self.driver.find_element_by_xpath("//*[@id='filter_degreefrom']/ul/li[5]/a").click()    # 学历要求

    def get_webpage(self):
        webpages = self.driver.find_elements_by_xpath("//*[@id='resultList']/div/p/span/a")
        for webpage in webpages:
            webpage = webpage.get_attribute("href")
            self.webpages_list.append(webpage)
        print(self.webpages_list)
        self.driver.close()

    @staticmethod
    def data_cleaning(url):
        user_agent = 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
        headers = {
            'User-Agent': user_agent
        }
        #url = "https://jobs.51job.com/hangzhou-jgq/104504900.html?s=01&t=0"
        r = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
        print(r.text)  # debug: dump the raw page
        soup = BeautifulSoup(r.text, 'html.parser', exclude_encodings="utf-8")
        
        # company name
        sname = soup.find_all(class_='catn')[0]['title']
        # job description block; returns a <class 'bs4.element.Tag'>
        directory = soup.find_all(class_='bmsg job_msg inbox')[0]
        # convert to str first: calling .replace() on the Tag itself fails with
        # TypeError: 'NoneType' object is not callable
        job_datas = str(directory).replace("\n", "")
        pattern = re.compile('<div class="bmsg job_msg inbox">(.*?)<div', re.S)
        job_data = re.findall(pattern, job_datas)
        job_data = job_data[0].replace('<p>', '').replace('</p>', '\n')
        # monthly salary
        job_salary = soup.find_all(class_='cn')[0].strong.text
        return sname, job_data, job_salary
        
class Data_Transport(object):
    def __init__(self):
        # charset='utf8' avoids: 'latin-1' codec can't encode characters in position 61-72: ordinal not in range(256)
        self.tbd = pymysql.connect(host='localhost', user='root', password='wjh940517',
                                   database='qianc_job', charset='utf8')
        self.t_cursor = self.tbd.cursor()

    def create_table(self):
        sql = "create table 51job(id int auto_increment primary key," \
              "sname varchar(200) not null," \
              "job_datas varchar(2000) not null," \
              "job_salary varchar(100) not null)"
        self.t_cursor.execute(sql)
        self.tbd.commit()

    def data_save(self, sname, job_data, job_salary):
        # without escape_string(): pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax ...")
        sql = """INSERT INTO 51job (sname, job_datas, job_salary) VALUES("%s", "%s", "%s")""" % (sname, pymysql.escape_string(job_data), job_salary)
        self.t_cursor.execute(sql)
        self.tbd.commit()

    def tear_down(self):
        self.t_cursor.close()
        self.tbd.close()


if __name__ == "__main__":
    find_jobs = FindJobs()
    find_jobs.find_position()
    find_jobs.find_position_2()
    find_jobs.get_webpage()
    data_transport = Data_Transport()
    data_transport.create_table()
    for url in find_jobs.webpages_list:
        try:
            sname, job_data, job_salary = find_jobs.data_cleaning(url)
            data_transport.data_save(sname, job_data, job_salary)
        except IndexError:
            pass
    data_transport.tear_down()

An explanation of this final loop:

for url in find_jobs.webpages_list:
    try:
        sname, job_data, job_salary = find_jobs.data_cleaning(url)
        data_transport.data_save(sname, job_data, job_salary)
    except IndexError:
        pass

Not every company posts with 51job's own page template; some listings redirect to the company's official site, where the selectors in data_cleaning() find nothing and the list indexing raises IndexError.

So, well, you know: those pages simply get skipped.
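
If silently swallowing IndexError feels too blunt, the redirect can also be detected explicitly. A minimal sketch (the helper name is made up for illustration): requests follows redirects by default, and r.url holds the final address, so pages that ended up outside 51job can be rejected before the selectors ever run:

from urllib.parse import urlparse
import requests

def fetch_if_51job(url, headers):
    # Hypothetical pre-check: requests follows redirects, so r.url is the
    # final address; reject pages that left the 51job.com host, since the
    # template selectors in data_cleaning() will not match them.
    r = requests.get(url, headers=headers)
    if not urlparse(r.url).netloc.endswith('51job.com'):
        return None  # external company site
    return r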

That wraps up the whole series on scraping 51jobs with Python. To view the collected data, try reading it back out of the database and building a small site for it with Django.

What, you don't know how to set up Django? Click here for the Django tutorial.
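
Whether or not you go the Django route, a quick sanity check is to read the rows back with PyMySQL itself. A minimal sketch, reusing the connection details from above:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='wjh940517',
                       database='qianc_job', charset='utf8')
cursor = conn.cursor()
cursor.execute("SELECT sname, job_salary FROM 51job ORDER BY id")
for sname, job_salary in cursor.fetchall():
    print(sname, job_salary)
cursor.close()
conn.close()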

Of course, the code above still has plenty of room for improvement; corrections are welcome.
