Crawler Practice (5): Scraping Job Postings from lagou.com into a MySQL Database

Target site:

https://www.lagou.com/

Create a table in MySQL to hold the scraped data.

Table creation statements (one column for every field the crawler extracts):

create database if not exists spider;
use spider;
CREATE TABLE `lagou` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `t_job` varchar(255) DEFAULT NULL,
  `t_addr` varchar(255) DEFAULT NULL,
  `t_money` varchar(255) DEFAULT NULL,
  `t_exp` varchar(255) DEFAULT NULL,
  `t_edu` varchar(255) DEFAULT NULL,
  `t_tag` varchar(255) DEFAULT NULL,
  `t_com` varchar(255) DEFAULT NULL,
  `t_domain` varchar(255) DEFAULT NULL,
  `t_stage` varchar(255) DEFAULT NULL,
  `t_adv` varchar(255) DEFAULT NULL,
  `t_time` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
show tables;
select * from lagou;
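
Before running the crawler, it helps to confirm that Python can reach MySQL and see the new table. A minimal sketch, assuming the same connection parameters used in the program below (host potter2 and root/root are the author's setup; substitute your own):

import pymysql

# Assumed connection parameters -- adjust host/user/password to your environment.
con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)
cursor = con.cursor()
cursor.execute("show tables")
print(cursor.fetchall())  # should include the lagou table
cursor.close()
con.close()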

Program code:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import time
import pymysql

head = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                "Cookie":"JSESSIONID=ABAAABAAAIAACBI2D7B8FFA2C068F071646B86911EFDF59; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1522318478; _ga=GA1.2.1811029983.1522318478; _gat=1; user_trace_token=20180329181720-5da621d4-333a-11e8-a3c4-525400f775ce; LGSID=20180329181720-5da622d0-333a-11e8-a3c4-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20180329181720-5da6248b-333a-11e8-a3c4-525400f775ce; _gid=GA1.2.1779902732.1522318478; index_location_city=%E5%85%A8%E5%9B%BD; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1522318487; LGRID=20180329181729-632f2563-333a-11e8-a3c4-525400f775ce; TG-TRACK-CODE=index_navigation"}


# Connect to the spider database created above (adjust host/user/password to your setup).
con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)

cursor = con.cursor()
"""

数据库建表语句:

create database if not exists spider;
use spider;
CREATE TABLE `lagou` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `t_job` varchar(255) DEFAULT NULL,
  `t_addr` varchar(255) DEFAULT NULL,
  `t_tag` varchar(255) DEFAULT NULL,
  `t_com` varchar(255) DEFAULT NULL,
  `t_money` varchar(255) DEFAULT NULL,
  `t_edu` varchar(255) DEFAULT NULL,
  `t_exp` varchar(255) DEFAULT NULL,
  `t_time` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
show tables;
select * from lagou;


"""


def get_all_url():
    """Collect the link of every job category from the Lagou homepage nav."""
    url = "https://www.lagou.com/"
    request1 = Request(url, headers=head)
    response = urlopen(request1)
    bs = BeautifulSoup(response, "html.parser")
    lagou_list = bs.select("div.mainNavs a")
    # Deduplicate: the same category link can appear several times in the nav.
    url_list = {link.get("href") for link in lagou_list if link.get("href")}
    print(url_list)
    return url_list


url_list = get_all_url()
print(len(url_list))

def crawl(link):
    for page in range(1, 31):
        url = link + str(page) + "/"
        print("About to scrape page %d, url: %s" % (page, url))
        request = Request(url, headers=head)
        response = urlopen(request)
        # If the url we requested differs from the url the server answered
        # with, we were redirected: this category has no more pages, so
        # return and move on to the next job-category link.
        if url != response.geturl():
            print("Page %d has no data; moving on to the next link." % page)
            return
        soup = BeautifulSoup(response, "html.parser")
        # Job title
        title = soup.select("a.position_link h3")
        # Location
        loc = soup.select("span.add em")
        # Salary plus the experience/education line
        see = soup.select("div.p_bot > div.li_b_l")
        # Position tags
        tags = soup.select("div.list_item_bot > div.li_b_l")
        # Company name
        company = soup.select("div.company_name > a")
        # Company industry and funding stage
        d_s = soup.select("div.industry")
        # Company perks
        adv = soup.select("div.li_b_r")
        # Publication time
        times = soup.select("div.list_item_top > div.position > div.p_top > span")

        for i in range(len(title)):
            _title = title[i].text
            _loc = loc[i].text
            # stripped_strings yields the tag's text fragments with whitespace
            # trimmed: the first is the salary, the second is "experience / education".
            temp = list(see[i].stripped_strings)
            _salary = temp[0]
            # split() cuts the string on the given separator and returns a list.
            temp = temp[1].split(" / ")
            _exp = temp[0]
            _edu = temp[1]
            _tags = ",".join(tags[i].stripped_strings)
            _company = company[i].text
            temp = d_s[i].text.strip()
            temp = temp.split(" / ")
            # Some listings omit the industry or the funding stage.
            if len(temp) == 2:
                _domain1 = temp[0]
                _stage = temp[1]
            else:
                _domain1 = "无"
                _stage = "无"
            _adv = adv[i].text.strip().strip("“”")
            time1 = times[i].text
            print([_title, _loc, _salary, _exp, _edu, _tags, _company, _domain1, _stage, _adv, time1])
            # Parameterized insert; the column order matches the record list below.
            lagou_insert_sql = ("insert into lagou (t_job, t_addr, t_money, t_exp, t_edu, t_tag, "
                                "t_com, t_domain, t_stage, t_adv, t_time) "
                                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            record = [_title, _loc, _salary, _exp, _edu, _tags, _company, _domain1, _stage, _adv, time1]
            cursor.execute(lagou_insert_sql, record)
        # Pause between pages to avoid hammering the server, then commit the page's rows.
        time.sleep(3)
        con.commit()
for job_link in url_list:
    crawl(job_link)


cursor.close()
con.close()
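
Once the crawl finishes, a quick query shows what landed in the table. A short standalone sketch, with the same assumed connection parameters as above:

import pymysql

con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)
cursor = con.cursor()
cursor.execute("select count(*) from lagou")
print("rows scraped:", cursor.fetchone()[0])
cursor.execute("select t_job, t_com, t_money from lagou limit 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
con.close()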


