51job (前程无忧) scraper – for learning purposes only
51job job-listing URL: https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
First, right-click and use the browser's inspect tool to analyze the page. Here we have already located the links to the detail pages.
As you can see, each detail-page link sits inside an a tag, so it can be pulled out with an XPath expression: urls = html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href"). Requesting one of these URLs takes us to the detail page.
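To make the XPath concrete, here is a minimal, self-contained sketch run against a made-up HTML fragment shaped like the 51job result list (the fragment is illustrative only, not copied from the real page):

from lxml import etree

# Hypothetical fragment mirroring the list-page structure: dw_table > el > p > span > a
sample = '''
<div class="dw_table">
  <div class="el">
    <p class="t1"><span><a href="https://jobs.51job.com/shanghai/123.html">job A</a></span></p>
  </div>
  <div class="el">
    <p class="t1"><span><a href="https://jobs.51job.com/shanghai/456.html">job B</a></span></p>
  </div>
</div>
'''
html = etree.HTML(sample)
print(html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href"))
# ['https://jobs.51job.com/shanghai/123.html', 'https://jobs.51job.com/shanghai/456.html']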
This is the detail page. Every piece of information shown on the left can be found in the page source on the right, so extracting it is again just a matter of XPath.
With the page analysis done, it is time to write the code.
Function that extracts the detail-page links:
def get_urls():
    for i in range(1, 46):  # limit the number of pages to crawl
        print("Fetching data for page {}".format(i))
        url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?'.format(i)
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        urls = html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href")
        # print(urls)
        parse_urls(urls)
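The hrefs on this page are normally absolute URLs, but if they ever come back as relative paths, urllib.parse.urljoin can normalize them before requesting. A small defensive sketch, not part of the original script:

from urllib.parse import urljoin

base = 'https://search.51job.com/'
urls = [urljoin(base, u) for u in urls]  # already-absolute URLs pass through unchanged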
Parse each detail page and extract the data:
def parse_urls(urls):
    for ul in urls:
        try:
            print(ul)
            response = requests.get(ul, headers=headers)
            response.encoding = 'gbk'  # detail pages are served as gbk
            html = etree.HTML(response.text)
            # print(response.text)
            position_name = html.xpath("//div[@class='cn']/h1/text()")[0]  # job title
            company_name = html.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()")[0]  # company name
            address = html.xpath("//div[@class='cn']/p[2]/text()")[0]  # location
            salary = html.xpath("//div[@class='cn']/strong/text()")[0]  # salary
            induction_requirements = html.xpath("//div[@class='cn']/p[2]/text()")[1]  # experience/entry requirement
            education = html.xpath("//div[@class='cn']/p[2]/text()")[2]  # education
            number = html.xpath("//div[@class='cn']/p[2]/text()")[3]  # number of openings
            release_time = html.xpath("//div[@class='cn']/p[2]/text()")[4]  # posting date
            print(position_name, company_name, address, salary, induction_requirements, education, number, release_time)
            datas = [position_name, company_name, address, salary, induction_requirements, education, number, release_time]
            # writer.writerow(datas)
        except Exception as e:
            print('Error: {}, incomplete record, skipping'.format(e))
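Any missing field raises an IndexError here and the whole record is discarded. A hedged alternative is a tiny helper that falls back to an empty string; pick below is a hypothetical name, not part of the original script:

def pick(nodes, i, default=''):
    # Return the i-th extracted text, stripped of whitespace, or a default when it is missing.
    return nodes[i].strip() if len(nodes) > i else default

# Usage idea: query the attribute line once, then index into it safely.
# attrs = html.xpath("//div[@class='cn']/p[2]/text()")
# address, release_time = pick(attrs, 0), pick(attrs, 4)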
The complete code is as follows:
# Time: 2020/03/29
# author: 渔戈
import requests
from lxml import etree
import csv

# Write the scraped data to a CSV file
fp = open('前程无忧.csv', 'a', encoding='utf-8', newline='')
writer = csv.writer(fp)  # initialize the CSV writer
header = ['position_name', 'company_name', 'address', 'salary', 'induction_requirements', 'education', 'number', 'release_time']
writer.writerow(header)  # write the header row
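# Optional tweak (not in the original script): if the CSV will be opened directly
# in Excel, encoding='utf-8-sig' writes a BOM so the Chinese text is not garbled:
# fp = open('前程无忧.csv', 'a', encoding='utf-8-sig', newline='')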
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
def get_urls():
    for i in range(1, 46):  # limit the number of pages; there are at most 45 result pages
        print("Fetching data for page {}".format(i))
        url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?'.format(i)
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        urls = html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href")
        # print(urls)
        parse_urls(urls)
def parse_urls(urls):
    for ul in urls:
        try:
            print(ul)
            response = requests.get(ul, headers=headers)
            response.encoding = 'gbk'  # detail pages are served as gbk
            html = etree.HTML(response.text)
            # print(response.text)
            position_name = html.xpath("//div[@class='cn']/h1/text()")[0]  # job title
            company_name = html.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()")[0]  # company name
            address = html.xpath("//div[@class='cn']/p[2]/text()")[0]  # location
            salary = html.xpath("//div[@class='cn']/strong/text()")[0]  # salary
            induction_requirements = html.xpath("//div[@class='cn']/p[2]/text()")[1]  # experience/entry requirement
            education = html.xpath("//div[@class='cn']/p[2]/text()")[2]  # education
            number = html.xpath("//div[@class='cn']/p[2]/text()")[3]  # number of openings
            release_time = html.xpath("//div[@class='cn']/p[2]/text()")[4]  # posting date
            print(position_name, company_name, address, salary, induction_requirements, education, number, release_time)
            datas = [position_name, company_name, address, salary, induction_requirements, education, number, release_time]
            writer.writerow(datas)  # append one record to the CSV
        except Exception as e:
            print('Error: {}, incomplete record, skipping'.format(e))
if __name__ == '__main__':
    get_urls()
    fp.close()
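Finally, a usage note: to be polite to the site and reduce the chance of being blocked, it is common to pause briefly between requests. A minimal sketch (the delay values are arbitrary, and time/random would need to be imported at the top of the script):

import random
import time

# Call this after each requests.get(...) inside get_urls() and parse_urls():
time.sleep(random.uniform(1, 3))  # wait 1-3 seconds between requests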