Python爬取拉勾网数据

最新推荐文章于 2021-03-21 20:33:53 发布

liuzemeeting

最新推荐文章于 2021-03-21 20:33:53 发布

阅读量425

点赞数

分类专栏：python、爬虫　文章标签：爬虫

本文链接：https://blog.csdn.net/liuzemeeting/article/details/79212404

版权

python 同时被 2 个专栏收录

14 篇文章 0 订阅

订阅专栏

爬虫

6 篇文章 0 订阅

订阅专栏

import requests
import re#引用正则匹配
from bs4 import BeautifulSoup
# Request headers that make the script look like a desktop Chrome browser;
# passed as ``headers=`` to the requests.get calls in this file.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
def local():
    """Fetch the Lagou home page and extract the links matching a fixed pattern.

    The page at ``https://www.lagou.com/`` is downloaded with the module-level
    ``headers`` and scanned with a regular expression for anchors carrying
    ``data-lg-tj-id="4A00"``.

    Returns:
        list[tuple[str, str]]: ``(href, anchor_text)`` pairs in page order;
        an empty list when no anchor matches.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    url = "https://www.lagou.com/"
    # requests applies no timeout by default, so a stalled connection would
    # block the script forever; bound the wait explicitly.
    response = requests.get(url, headers=headers, timeout=10)
    # Two capture groups: the href value and the anchor text.
    pattern = r' <a href="(.*?)" data-lg-tj-id="4A00" data-lg-tj-no=".*?" data-lg-tj-cid="idnull">(.*?)</a>'
    return re.findall(pattern, response.text)

def postion(url):
    """Download a listing page and flatten its postings into a single list.

    Every element matching the CSS selector ``.default_list`` contributes
    seven consecutive entries to the result, in this order:

    1. text of the first element with class ``add``
    2. text of the second ``<a>`` inside the posting
    3. text of the first ``industry`` element, with spaces removed
    4. text of the first ``li_b_r`` element
    5. text of the first ``money`` element
    6. the part of the first ``li_b_l`` element's text after its last ``'k'``
       (presumably drops the leading salary range -- confirm against the page)
    7. ``href`` of the first ``position_link`` element

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: the flat list described above; empty when the page has no
        ``.default_list`` element.

    Raises:
        IndexError: if a posting lacks one of the expected elements.
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # requests applies no timeout by default; bound the wait so one stalled
    # page cannot hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    fields = []
    for news in soup.select('.default_list'):
        def first(class_name):
            # First descendant with the given CSS class; IndexError if absent.
            return news.find_all(class_=class_name)[0]

        # Lookups are evaluated in the same order as the output fields.
        # The link gets its own name instead of overwriting the ``url``
        # parameter.
        place = first('add').text
        company_name = news.select('a')[1].text
        company_class = first('industry').text.replace(' ', '')
        company_speak = first('li_b_r').text
        work_money = first('money').text
        work_need = first('li_b_l').text.split('k')[-1]
        position_url = first('position_link')['href']

        fields.extend([
            place,
            company_name,
            company_class,
            company_speak,
            work_money,
            work_need,
            position_url,
        ])

    return fields
# Visit every link found on the home page and print each extracted field on
# its own line. The anchor text that comes with each link is not used.
for url, _title in local():
    for item in postion(url):
        print(item)