Without further ado, here's an example: the script below crawls job listings from 58.com, parses each posting's detail page with BeautifulSoup, and stores the extracted fields in MySQL.
# -*- coding: UTF-8 -*-
import random
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence the certificate warning triggered by the verify=False request below.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'}
def select_text(soup, selector):
    """Return the stripped text of the first node matching a CSS selector, or ''."""
    nodes = soup.select(selector)
    return nodes[0].text.strip() if nodes else ''


def get_info(URL):
    wb_data = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(wb_data.content, 'lxml')

    JobName = select_text(soup, 'span.pos_name')
    JobSalar = select_text(soup, 'span.pos_salary')
    JobUpdate = select_text(soup, 'span.pos_base_update')
    JobTitle = select_text(soup, 'span.pos_title')
    JobDes = select_text(soup, 'div.des')

    # A posting may list several welfare items; concatenate them all.
    JobWelfare = ''.join(w.text.strip() for w in soup.select('span.pos_welfare_item'))

    CompanyNameList = soup.select('div.baseInfo_link > a')
    if CompanyNameList:
        CompanyNameUrl = CompanyNameList[0].get('href')
        CompanyName = CompanyNameList[0].text.strip()
    else:
        CompanyNameUrl = ''
        CompanyName = ''

    # Headcount/experience/education appear as span.item_condition nodes;
    # join whatever is present instead of assuming exactly three.
    Shuliang = ''.join(s.text.strip() for s in soup.select('span.item_condition'))
    print(JobName, JobSalar, JobUpdate, JobTitle, JobWelfare,
          JobDes, CompanyName, CompanyNameUrl, Shuliang)
    # Connect to the database and insert one row for this posting.
    conn = pymysql.connect(host="localhost", user="root", password="root",
                           db="wuba", port=3306, charset="utf8")
    cur = conn.cursor()  # create a cursor object
    insert_mysql = ("INSERT INTO wu_job (JobName, JobSalar, JobUpdate, JobTitle, "
                    "JobWelfare, JobDes, CompanyName, CompanyNameUrl, Shuliang, url) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cur.execute(insert_mysql,
                (JobName, JobSalar, JobUpdate, JobTitle, JobWelfare,
                 JobDes, CompanyName, CompanyNameUrl, Shuliang, URL))
    conn.commit()
    print('ok')
    cur.close()   # close the cursor before the connection
    conn.close()
def get_all_info():
    url = 'https://bz.58.com/songcanyuan/?PGTID=0d202408-003b-0c4c-00d4-d1e7471f2361&ClickID=1'
    wb_data = requests.get(url, headers=HEADERS, verify=False)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Each listing links to a detail page; crawl them one by one.
    for i in soup.select('div.job_name > a'):
        link = i.get('href')
        time.sleep(random.random() * 6)  # random delay so requests aren't bursty
        get_info(link)


get_all_info()
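The INSERT above assumes a wu_job table already exists in the wuba database, but the post never shows its schema. A minimal one-off setup sketch, assuming TEXT columns throughout and an auto-increment primary key (the column types are my assumption, not the original author's DDL):

# One-off setup sketch: create the wu_job table the crawler writes to.
# Column names mirror the INSERT statement above; the types are an assumption.
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root",
                       db="wuba", port=3306, charset="utf8")
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS wu_job (
            id INT AUTO_INCREMENT PRIMARY KEY,
            JobName TEXT, JobSalar TEXT, JobUpdate TEXT, JobTitle TEXT,
            JobWelfare TEXT, JobDes TEXT, CompanyName TEXT,
            CompanyNameUrl TEXT, Shuliang TEXT, url TEXT
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()

Run it once before starting the crawler; after that, each scraped posting becomes one row in wu_job.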