Target URL:
https://www.lagou.com/
Create a table in MySQL to hold the data to be scraped.
Table creation statements:
create database if not exists spider;
use spider;
CREATE TABLE `lagou` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`t_job` varchar(255) DEFAULT NULL,
`t_addr` varchar(255) DEFAULT NULL,
`t_tag` varchar(255) DEFAULT NULL,
`t_com` varchar(255) DEFAULT NULL,
`t_money` varchar(255) DEFAULT NULL,
`t_edu` varchar(255) DEFAULT NULL,
`t_exp` varchar(255) DEFAULT NULL,
`t_time` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
show tables;
select * from lagou;
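Before running the crawler it is worth confirming that Python can reach this database and see the lagou table. Below is a minimal connectivity check, assuming MySQL is reachable with the same settings the crawler uses (host potter2, user root, password root, database spider); adjust them to your own environment:

import pymysql

# Connection settings mirror the crawler below; change them to match your MySQL setup.
con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)
cursor = con.cursor()
cursor.execute("show tables")
print(cursor.fetchall())  # should contain ('lagou',)
cursor.close()
con.close()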
Program code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import time
import pymysql
# Request headers. The Cookie below was copied from the author's browsing session;
# Lagou's listing pages generally require a current cookie, so replace it with one
# taken from your own browser if requests fail or come back empty.
head = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
"Cookie":"JSESSIONID=ABAAABAAAIAACBI2D7B8FFA2C068F071646B86911EFDF59; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1522318478; _ga=GA1.2.1811029983.1522318478; _gat=1; user_trace_token=20180329181720-5da621d4-333a-11e8-a3c4-525400f775ce; LGSID=20180329181720-5da622d0-333a-11e8-a3c4-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20180329181720-5da6248b-333a-11e8-a3c4-525400f775ce; _gid=GA1.2.1779902732.1522318478; index_location_city=%E5%85%A8%E5%9B%BD; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1522318487; LGRID=20180329181729-632f2563-333a-11e8-a3c4-525400f775ce; TG-TRACK-CODE=index_navigation"}
# Connect to the spider database created above; adjust host/user/password to your MySQL setup.
con = pymysql.connect(host="potter2", user="root", password="root", database="spider", charset="utf8", port=3306)
cursor = con.cursor()
"""
数据库建表语句:
create database if not exists spider;
use spider;
CREATE TABLE `lagou` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`t_job` varchar(255) DEFAULT NULL,
`t_addr` varchar(255) DEFAULT NULL,
`t_tag` varchar(255) DEFAULT NULL,
`t_com` varchar(255) DEFAULT NULL,
`t_money` varchar(255) DEFAULT NULL,
`t_edu` varchar(255) DEFAULT NULL,
`t_exp` varchar(255) DEFAULT NULL,
`t_time` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
show tables;
select * from lagou;
"""
def get_all_url():
    url = "https://www.lagou.com/"
    request1 = Request(url, headers=head)
    response = urlopen(request1)
    bs = BeautifulSoup(response, "html.parser")
    # Every job-category link sits in the homepage's left-hand navigation.
    lagou_list = bs.select("div.mainNavs a")
    url_list = [link.get("href") for link in lagou_list]
    # Deduplicate the links.
    url_list = set(url_list)
    print(url_list)
    return url_list

url_list = get_all_url()
print(url_list)
print(len(url_list))
def crawl(link):
    for page in range(1, 31):
        url = link + str(page) + "/"
        print("About to crawl page %d, url: %s" % (page, url))
        request = Request(url, headers=head)
        response = urlopen(request)
        # If the requested url differs from the url the server actually answered with,
        # we were redirected, which means this page has no data; return and move on to
        # the next job-category link.
        if url != response.geturl():
            print("Page %d has no data, moving on to the next link." % page)
            return
        soup = BeautifulSoup(response, "html.parser")
        # Job title
        title = soup.select("a.position_link h3")
        # Location
        loc = soup.select("span.add em")
        # Salary plus experience/education requirements
        see = soup.select("div.p_bot > div.li_b_l")
        # Job tags
        tags = soup.select("div.list_item_bot > div.li_b_l")
        # Company name
        company = soup.select("div.company_name > a")
        # Industry and funding stage
        d_s = soup.select("div.industry")
        # Benefits
        adv = soup.select("div.li_b_r")
        # Publish time
        times = soup.select("div.list_item_top > div.position > div.p_top > span")
        for i in range(len(title)):
            _title = title[i].text
            _loc = loc[i].text
            temp = list(see[i].stripped_strings)
            _salary = temp[0]
            # split() cuts the string on the given separator and returns a list of the pieces.
            temp = temp[1].split(" / ")
            _exp = temp[0]
            _edu = temp[1]
            _tags = ",".join(tags[i].stripped_strings)
            _company = company[i].text
            temp = d_s[i].text.strip()
            temp = temp.split(" / ")
            if len(temp) == 2:
                _domain1 = temp[0]
                _stage = temp[1]
            else:
                _domain1 = "N/A"
                _stage = "N/A"
            _adv = adv[i].text.strip("“”")
            time1 = times[i].text
            print([_title, _loc, _salary, _exp, _edu, _tags, _company, _domain1, _stage, _adv, time1])
            # Column order matches the lagou table defined above; fields with no matching
            # column (industry, stage, benefits) are only printed.
            lagou_insert_sql = ("insert into lagou "
                                "(t_job, t_addr, t_tag, t_com, t_money, t_edu, t_exp, t_time) "
                                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
            record = [_title, _loc, _tags, _company, _salary, _edu, _exp, time1]
            cursor.execute(lagou_insert_sql, record)
        # Pause between pages to avoid hammering the site, then commit the page's rows.
        time.sleep(3)
        con.commit()

for job_link in url_list:
    crawl(job_link)

cursor.close()
con.close()
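The script only needs two third-party packages, pymysql and beautifulsoup4 (bs4); everything else comes from the standard library. After it finishes, the stored postings can be read back from MySQL, mirroring the select * from lagou; check above. A small sketch, under the same connection assumptions as the crawler:

import pymysql

con = pymysql.connect(host="potter2", user="root", password="root",
                      database="spider", charset="utf8", port=3306)
cursor = con.cursor()
# Count the stored postings and show a few of them.
cursor.execute("select count(*) from lagou")
print("rows:", cursor.fetchone()[0])
cursor.execute("select t_job, t_com, t_money, t_addr from lagou limit 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
con.close()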