Target URL:
https://www.lagou.com/
Create a table in MySQL to hold the data to be scraped.
Table creation statements:
create database if not exists spider;
use spider;
CREATE TABLE `lagou` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`t_job` varchar(255) DEFAULT NULL,
`t_addr` varchar(255) DEFAULT NULL,
`t_tag` varchar(255) DEFAULT NULL,
`t_com` varchar(255) DEFAULT NULL,
`t_money` varchar(255) DEFAULT NULL,
`t_edu` varchar(255) DEFAULT NULL,
`t_exp` varchar(255) DEFAULT NULL,
`t_time` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
show tables;
select * from lagou;
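Before running the crawler it is worth confirming that Python can reach this database and see the lagou table. Below is a minimal connectivity check, assuming MySQL is reachable with the same settings the crawler uses (host potter2, user root, password root, database spider); adjust them to your own environment:

import pymysql

# Connection settings mirror the crawler below; change them to match your MySQL setup.
con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)
cursor = con.cursor()
cursor.execute("show tables")
print(cursor.fetchall())  # should contain ('lagou',)
cursor.close()
con.close()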
Program code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import time
import pymysql
# Request headers. The Cookie below was copied from the author's browsing session;
# Lagou's listing pages generally require a current cookie, so replace it with one
# taken from your own browser if requests fail or come back empty.
head = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
"Cookie":"JSESSIONID=ABAAABAAAIAACBI2D7B8FFA2C068F071646B86911EFDF59; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1522318478; _ga=GA1.2.1811029983.1522318478; _gat=1; user_trace_token=20180329181720-5da621d4-333a-11e8-a3c4-525400f775ce; LGSID=20180329181720-5da622d0-333a-11e8-a3c4-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20180329181720-5da6248b-333a-11e8-a3c4-525400f775ce; _gid=GA1.2.1779902732.1522318478; index_location_city=%E5%85%A8%E5%9B%BD; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1522318487; LGRID=20180329181729-632f2563-333a-11e8-a3c4-525400f775ce; TG-TRACK-CODE=index_navigation"}
# Connect to the spider database created above; adjust host/user/password to your MySQL setup.
con = pymysql.connect(host="potter2", user="root", password="root", database="spider", charset="utf8", port=3306)
cursor = con.cursor()
"""
数据库建表语句:
create database if not exists spider;
use spider;
CREATE TABLE `lagou` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`t_job` varchar(255) DEFAULT NULL,
`t_addr` varchar(255) DEFAULT NULL,
`t_tag` varchar(255) DEFAULT NULL,
`t_com` varchar(255) DEFAULT NULL,
`t_money` varchar(255) DEFAULT NULL,
`t_edu` varchar(255) DEFAULT NULL,
`t_exp` varchar(255) DEFAULT NULL,
`t_time` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
show tables;
select * from lagou;
"""
def get_all_url():
    url = "https://www.lagou.com/"
    request1 = Request(url, headers=head)
    response = urlopen(request1)
    bs = BeautifulSoup(response, "html.parser")
    # Every job-category link sits in the homepage's left-hand navigation.
    lagou_list = bs.select("div.mainNavs a")
    url_list = [link.get("href") for link in lagou_list]
    # Deduplicate the links.
    url_list = set(url_list)
    print(url_list)
    return url_list

url_list = get_all_url()
print(url_list)
print(len(url_list))
def crawl(link):
    for page in range(1, 31):
        url = link + str(page) + "/"
        print("About to crawl page %d, url: %s" % (page, url))
        request = Request(url, headers=head)
        response = urlopen(request)
        # If the requested url differs from the url the server actually answered with,
        # we were redirected, which means this page has no data; return and move on to
        # the next job-category link.
        if url != response.geturl():
            print("Page %d has no data, moving on to the next link." % page)
            return
        soup = BeautifulSoup(response, "html.parser")
        # Job title
        title = soup.select("a.position_link h3")
        # Location
        loc = soup.select("span.add em")
        # Salary plus experience/education requirements
        see = soup.select("div.p_bot > div.li_b_l")
        # Job tags
        tags = soup.select("div.list_item_bot > div.li_b_l")
        # Company name
        company = soup.select("div.company_name > a")
        # Industry and funding stage
        d_s = soup.select("div.industry")
        # Benefits
        adv = soup.select("div.li_b_r")
        # Publish time
        times = soup.select("div.list_item_top > div.position > div.p_top > span")
        for i in range(len(title)):
            _title = title[i].text
            _loc = loc[i].text
            temp = list(see[i].stripped_strings)
            _salary = temp[0]
            # split() cuts the string on the given separator and returns a list of the pieces.
            temp = temp[1].split(" / ")
            _exp = temp[0]
            _edu = temp[1]
            _tags = ",".join(tags[i].stripped_strings)
            _company = company[i].text
            temp = d_s[i].text.strip()
            temp = temp.split(" / ")
            if len(temp) == 2:
                _domain1 = temp[0]
                _stage = temp[1]
            else:
                _domain1 = "N/A"
                _stage = "N/A"
            _adv = adv[i].text.strip("“”")
            time1 = times[i].text
            print([_title, _loc, _salary, _exp, _edu, _tags, _company, _domain1, _stage, _adv, time1])
            # Column order matches the lagou table defined above; fields with no matching
            # column (industry, stage, benefits) are only printed.
            lagou_insert_sql = ("insert into lagou "
                                "(t_job, t_addr, t_tag, t_com, t_money, t_edu, t_exp, t_time) "
                                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
            record = [_title, _loc, _tags, _company, _salary, _edu, _exp, time1]
            cursor.execute(lagou_insert_sql, record)
        # Pause between pages to avoid hammering the site, then commit the page's rows.
        time.sleep(3)
        con.commit()

for job_link in url_list:
    crawl(job_link)

cursor.close()
con.close()
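The script only needs two third-party packages, pymysql and beautifulsoup4 (bs4); everything else comes from the standard library. After it finishes, the stored postings can be read back from MySQL, mirroring the select * from lagou; check above. A small sketch, under the same connection assumptions as the crawler:

import pymysql

con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)
cursor = con.cursor()
# Count the stored postings and show a few of them.
cursor.execute("select count(*) from lagou")
print("rows:", cursor.fetchone()[0])
cursor.execute("select t_job, t_com, t_money, t_addr from lagou limit 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
con.close()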