BeautifulSoup 的标准选择器法——find_all() / findAll() 方法实战之前程无忧网页数据爬取

#--BS,find()和findAll()方法

from bs4 import BeautifulSoup as bs
import requests as res 
import time
# f = open(r'E:\PythonData\非代码数据\前程无忧源码.txt',encoding = 'gbk').read()
# Accumulators for the scraped columns; each parsed job row appends exactly
# one value to every list, so the four lists stay the same length.
(post1, comp1, site1, salary1) = ([], [], [], [])

# The request header does not depend on the page number, so build it once.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}


def _cell_text(row, name, **attrs):
    """Return the stripped text of the first matching tag in *row*, or "".

    The empty string is returned when no tag matches or when the tag's
    ``.string`` is None. These are the cases the field lookups need to
    tolerate; checking them explicitly avoids a bare ``except`` that would
    also hide unrelated errors (including KeyboardInterrupt).
    """
    tag = row.find(name, **attrs)
    if tag is None or tag.string is None:
        return ""
    return tag.string.strip()


for page in range(1, 3):
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(page)
    # A timeout keeps the script from hanging forever on a stalled connection.
    html = res.get(url, headers=head, timeout=10)
    html.encoding = 'gbk'
    soup = bs(html.text, 'html.parser')
    # Pause between pages to avoid sending requests back to back.
    time.sleep(3)

    result_list = soup.find('div', class_="dw_table", id="resultList")
    if result_list is None:
        # Without the result container there is nothing to parse; skip the
        # page instead of failing on None.findAll(...).
        print('第%d页未找到结果列表,已跳过' % page)
        continue

    # The first "el" element is skipped, as in the row slice [1:]
    # (presumably the table header row -- verify against the page markup).
    for i in result_list.findAll(class_="el")[1:]:
        post = _cell_text(i, 'a', target="_blank")
        comp = _cell_text(i, 'span', class_="t2")
        site = _cell_text(i, 'span', class_="t3")
        salary = _cell_text(i, 'span', class_="t4")
        post1.append(post)
        comp1.append(comp)
        site1.append(site)
        salary1.append(salary)
    print('第%d页' % page)

# Print the number of collected values per column on one line.
print(*map(len, (post1, comp1, site1, salary1)))
import pandas as pd

# Map each output column label to its list of scraped values.
dict1 = dict(zip(('post', 'company', 'site', 'salary'),
                 (post1, comp1, site1, salary1)))
df = pd.DataFrame(data=dict1)

# Write the table to a GBK-encoded CSV file at a fixed local path.
df.to_csv(r'E:\PythonData\Homework\全程无忧2.csv', encoding='gbk')
# Bare expression: presumably shows the frame when run in a notebook/REPL.
df

 

  • 3
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值