Using a Web Scraper
- Fetch the page with the requests library
- Parse the data with PyQuery
- Save with data.to_excel('./111.xls') (a minimal end-to-end sketch follows this list)
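The whole pipeline in miniature, as a hedged sketch: the URL and the span.name selector are taken from the code in the sections below, and the output is written as .xlsx because current pandas versions no longer write .xls.
# Minimal sketch of the pipeline: fetch -> parse -> save
import requests
import pandas as pd
from pyquery import PyQuery as pq

resp = requests.get('https://cq.58.com/zpshengchankaifa/pn1',
                    headers={'User-Agent': 'Mozilla/5.0'})
doc = pq(resp.text)
names = [x.text for x in doc('span.name')]  # job titles
pd.DataFrame({'names': names}).to_excel('./111.xlsx', index=False)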
<1> Create the connection
- url is the page address; pass in the URL of the page to scrape
- page is adjusted to match the site's pagination scheme, used to step through page numbers
- headers should be copied from your own browser: press F12 and inspect the request headers; multiple fields are separated by semicolons
- Anonymous IP (proxy): requests.get(url, headers=headers, proxies=proxy)
from pyquery import PyQuery as pq
import requests
import pandas as pd

def send_request(page):
    # Build the listing URL for the requested page number
    url = 'https://cq.58.com/zpshengchankaifa/pn' + str(page)
    print('URL:', url)
    # User-Agent copied from the browser (F12 -> Network -> request headers)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    # Anonymous proxy; replace with a live proxy of your own
    proxy = {'http': 'http://103.149.146.34:80'}
    resp = requests.get(url, headers=headers, proxies=proxy)
    return resp.text
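A quick smoke test of send_request; the free proxy hard-coded above may well be offline, so this sketch catches the request error instead of assuming success.
try:
    html = send_request(1)
    print('fetched', len(html), 'characters')
except requests.exceptions.RequestException as e:
    print('request failed, the proxy may be dead:', e)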
<2> Parse the data
- doc = pq(resp.text) # wrap the response in a PyQuery document; 'span.address' selects the span tags whose class is address
- doc('div.comp_name a') # selects the a tags nested inside div tags whose class is comp_name (see the selector sketch below)
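To see what these selectors match, here is a self-contained sketch on a hand-written snippet that mimics the 58.com markup (the snippet itself is an assumption for illustration):
from pyquery import PyQuery as pq

html = '''
<div class="comp_name"><a href="#">某某科技有限公司</a></div>
<span class="address">重庆-渝北区</span>
'''
doc = pq(html)
print(doc('span.address').text())     # -> 重庆-渝北区
print(doc('div.comp_name a').text())  # -> 某某科技有限公司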
# Parse the listing pages and collect each field into a list
def parse_html():
    job_salarys, address_s, names, companys, cates = [], [], [], [], []
    for i in range(1, 10):
        print('Page', i)
        doc = pq(send_request(i))
        job_salary = [x.text.split('-')[0] for x in doc('p.job_salary')]  # minimum salary
        address = [x.text for x in doc('span.address')]                   # work address
        name = [x.text for x in doc('span.name')]                         # job title
        company = [x.text for x in doc('div.comp_name a')]                # company name
        cate = [x.text for x in doc('span.cate')]                         # job category
        job_salarys += job_salary
        address_s += address
        names += name
        companys += company
        cates += cate
    print('Lengths:', len(job_salarys), len(address_s), len(names), len(companys), len(cates))
    data = pd.DataFrame(data={'names': names, 'job_salarys': job_salarys, 'address': address_s, 'companys': companys, 'cates': cates})
    save(data)  # persist to Excel
    return data
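Running the collection step end to end; save() is called inside, so the Excel file is written as a side effect:
data = parse_html()
print(data.shape)   # (rows, 5 columns)
print(data.head())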
<3> Label cleanup + deduplication
def salary_input(salary):
    # Normalize the lower bound of a salary range such as '3千-5千' or '1.2万-2万';
    # strip the unit character and scale (the original kept only the first digit,
    # which broke on multi-digit values like '12千')
    min_salary = salary.split('-')[0]
    if min_salary[-1] == '千':  # thousands
        min_salary = str(int(float(min_salary[:-1]) * 1000))
    elif min_salary[-1] == '万':  # ten-thousands
        min_salary = str(int(float(min_salary[:-1]) * 10000))
    return min_salary
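For example, on the two salary formats seen on the site:
print(salary_input('3千-5千'))    # -> 3000
print(salary_input('1.2万-2万'))  # -> 12000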
# Remove duplicates across five parallel lists: when an entry in lst reappears
# later, the earlier occurrence is dropped (so the last one is kept)
def drop_repeat(lst, lst2, lst3, lst4, lst5):
    drop_lst = []
    for i in range(len(lst)):
        for j in range(i + 1, len(lst)):
            if lst[j] == lst[i]:
                drop_lst.append(i)
                break
    # Pop from the end so the earlier indices stay valid
    for k in drop_lst[::-1]:
        lst.pop(k)
        lst2.pop(k)
        lst3.pop(k)
        lst4.pop(k)
        lst5.pop(k)
    return lst, lst2, lst3, lst4, lst5
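Since the records already live in a DataFrame, pandas' built-in drop_duplicates does the same job in one line; subset='names' is an assumption about which column defines a duplicate, and keep='last' mirrors drop_repeat, which discards the earlier occurrence:
# Equivalent deduplication on the DataFrame (subset column is an assumption)
data = data.drop_duplicates(subset='names', keep='last').reset_index(drop=True)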
<4> Save the Excel file
- data.to_excel('./111.xls')
def save(data):
    # Persist the scraped table to one worksheet
    data.to_excel('./58job_pugong_x2_1.xls',
                  sheet_name='job',
                  header=True,
                  index=False)

# Read the saved file back to verify (path matches the file written above)
job_data = pd.read_excel('./58job_pugong_x2_1.xls',
                         sheet_name=0,  # which worksheet to read; defaults to the first
                         header=0)
job_data.head(60)
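One caveat on the file format: pandas 2.x dropped the xlwt engine, so it can no longer write .xls files; if the save above fails, switching the extension to .xlsx (written via openpyxl) is a drop-in fix:
data.to_excel('./58job_pugong_x2_1.xlsx', sheet_name='job', index=False)
job_data = pd.read_excel('./58job_pugong_x2_1.xlsx', sheet_name=0, header=0)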