Python requests: multithreaded crawling of internship postings and classification with an XGBoost model

Multithreaded crawling of the 水木社区 (newsmth.net) Intern board with Python

Library import

from bs4 import BeautifulSoup
import pandas as pd
import requests
import logging
import numpy as np
import re
import tqdm
from threading import Thread
import queue
import time

Target Links

start=323000
end = 327000
link_list=[]
for i in range(start,end):
    link_list.append('http://www.newsmth.net/nForum/article/Intern/'+str(i)+'?ajax')

Spider class extending Thread

class ShuimuSpider(Thread):
    def __init__(self,url,q):
        super(ShuimuSpider,self).__init__()
        self.url=url
        self.q=q
        
    def run(self):
        self.parse_page()
        
    def send_request(self,url):
        try:
            html = requests.get(url,timeout=2)
        except Exception as e:
            # network error or timeout: report the URL and fall through (returns None)
            print("fail at %s" % url)
        else:
            return html
    def parse_page(self):
        response = self.send_request(self.url)
        if response is not None and response.status_code==200:
            response_bs=response.content
            try:
                bs = BeautifulSoup(response_bs,'html.parser')


                text=''
                store_data={
                    'time':[],
                    'title':[],
                    'detail':[]
                }
                text = bs.p.text
                # .p selects the first <p> tag; inspecting the posts shows that almost
                # every job posting's body sits inside the post's <p> element
                # .text returns the plain text of that element

                # locate the title, time and body by searching for marker strings
                # position for title:
                Title_Position1=text.find('标  题: ')+len('标  题: ')
                Title_Position2=text.find('  发信站: 水木社区')
                Title = text[Title_Position1:Title_Position2]
                # text[x:y] slices the string

                #position for time:
                Time_Position1=text.find('  发信站: 水木社区')+len('  发信站: 水木社区')
                Time_Position2=text.find('), 站内')+1
                Time=text[Time_Position1:Time_Position2]

                #position for detail
                Detail_Position1=text.find('), 站内')+len('), 站内')+4
                Detail_Position2=text.find('※ 来源:·水木社区')
                Detail=text[Detail_Position1:Detail_Position2]

                store_data['time'].append(Time)
                store_data['title'].append(Title)
                store_data['detail'].append(Detail)
                # store the extracted fields in the result dict

                self.q.put(store_data)
            except Exception:
                # skip posts whose layout does not match the expected markers
                return
        else:
            return
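
A minimal single-URL sanity check before launching hundreds of threads (a sketch; it assumes the first post id in link_list still exists and that the board layout is unchanged):

q_test = queue.Queue()
spider = ShuimuSpider(link_list[0], q_test)   # crawl a single post
spider.start()
spider.join()
if not q_test.empty():
    print(q_test.get())                       # {'time': [...], 'title': [...], 'detail': [...]}
else:
    print('nothing parsed: the post may have been deleted or the layout changed')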

Crawl function with multi-threading (Fast_Scrapy)

def Fast_Scrapy(x,y,store_data):
    start = time.time()
    q= queue.Queue(0)

    Thread_list=[]
    for url in link_list[x:y]:
        p = ShuimuSpider(url,q)
        p.start()
        Thread_list.append(p)

    for i in Thread_list:
        i.join()

    while not q.empty():
        data = q.get()

        store_data['time'].append(data['time'])
        store_data['title'].append(data['title'])
        store_data['detail'].append(data['detail'])
    end = time.time()
    print('Total time for this Thread + Queue multithreaded crawl batch:', end-start)

Run the crawl (Scrapy)

def Scrapy():
    store_data={
        'time':[],
        'title':[],
        'detail':[]
    }
    if end-start>150:
        # crawl in batches of 150 URLs; the final slice is clamped to the list length automatically
        period = np.arange(0,end-start,150)

        for i in tqdm.tqdm(period):
            Fast_Scrapy(i,i+150,store_data)
    else:
        # indices are offsets into link_list, which holds end-start URLs
        Fast_Scrapy(0,end-start,store_data)
        
    df = pd.DataFrame(store_data)
    df.to_csv('../dataset/'+str(start)+'-'+str(end)+'.csv')
    return df

%%time
df = Scrapy()

Data Cleaning

Unwrap single-element lists (remove '[]')

# each cell currently holds a one-element list; unwrap it to a plain string
for col in ['time','title','detail']:
    df[col]=df[col].apply(lambda x:x[0])

Convert to pd.datetime

# convert to pd.datetime
# the raw field looks roughly like ' (Tue Mar  2 10:23:45 2021)'; the two slices keep
# the weekday/month/day part and the year while dropping the clock time
df['time']=(df['time'].str[2:12]+df['time'].str[21:-1])
df['time']=pd.to_datetime(df['time'],errors='coerce')
df['time']=df['time'].fillna(method='ffill')   # forward-fill rows whose date failed to parse

Remove rows with extremely short or duplicated titles

df = df[df['title'].str.len()>5]
df = df[~df['title'].duplicated()]
df.reset_index(drop=True,inplace=True)

Remove rows without a contact email

pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern,flags=re.IGNORECASE)
df['Email']=None                       # object column, so each cell can hold a list of addresses
for i in range(len(df)):
    emails = regex.findall(df['detail'][i])
    if emails:                         # keep only rows that contain at least one email address
        df.at[i,'Email'] = emails
df.dropna(axis=0,how='any',inplace=True)
df.reset_index(drop=True,inplace=True)
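
A quick check of the pattern on a made-up snippet (the address below is purely illustrative):

sample = '有意者请将简历发送至 hr@example.com,标题注明姓名-学校'
print(regex.findall(sample))   # ['hr@example.com']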

Output as text file

def output(df):
    text=''
    for i in range(len(df)):
        text += str(df.index[i])+' '
        text += df['title'][i]+'\n'
        text += df['detail'][i]+'\n'
        text += str(df['Email'][i])+'\n\n\n'   # Email holds a list of addresses, so write its repr
    
    with open(str(start)+'-'+str(end)+'.txt','w',encoding='utf-8') as f:
        f.write(text)
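
Calling the function dumps all remaining postings into one text file named after the crawled id range:

output(df)   # writes 323000-327000.txt into the working directory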

Classify

Load Classification Model

(The model training process is documented on GitHub: https://github.com/YunmaoLeo/Crawler_Cluster_Classify)

import joblib
clf_model=joblib.load('../models/Classification_model_5_n_estimator=100_pca=400.joblib')

Predict function

mark = ['人力资源', '其他', '券商投行基金', '投资咨询实习生', '数据分析挖掘', '研发开发测试', '算法实习',
       '运营实习']
import jieba_fast

stopwords = pd.read_csv('../stopwords/baidu_stopwords.txt',
                        quoting=3,sep='\t',names=['stopword'],encoding='utf-8')
stopwords = stopwords['stopword'].values

def pre_input(input):
    sentences=[]
    segs=jieba_fast.cut(input,cut_all=False)                  # tokenise the raw text
    segs = list(filter(lambda x:x.strip(),segs))              # drop empty / whitespace-only tokens
    segs=list(filter(lambda x:((x not in stopwords) and x != '\xa0'),segs))  # drop stopwords and non-breaking spaces
    sentences.append(str([x for x in segs]))                  # string representation of the token list, fed straight to the pipeline
    return sentences

def predict_df(x):
    return mark[clf_model.predict(pre_input(x))[0]]
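
A minimal usage sketch on a made-up job description (the exact label depends on the trained model, but it is always one of the eight entries in mark):

sample_post = '招聘数据分析实习生,要求熟悉Python与SQL,能实习三个月以上'
print(pre_input(sample_post))   # tokenised, stopword-filtered text wrapped in a one-element list
print(predict_df(sample_post))  # e.g. '数据分析挖掘'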

The stopwords step here can be removed, or replaced with your own stopword list.
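
For example, a sketch of swapping in a different stopword file (the path below is hypothetical; any one-word-per-line UTF-8 file works):

my_stopwords = pd.read_csv('../stopwords/my_stopwords.txt',
                           quoting=3, sep='\t', names=['stopword'], encoding='utf-8')
stopwords = my_stopwords['stopword'].values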

Predict for DataFrame

df['category']=df['detail'].apply(predict_df)
df[50:60][['title','category']]

 

   title | category
50 【实习】【搜狐】市场部项目实习生-北京 | 其他
51 【实习】【银河证券行研】房地产组行业研究实习生招聘 | 券商投行基金
52 【实习】国信证券-私募直投子公司-国信弘盛-行业研究部-北京 | 券商投行基金
53 【MSRA】微软亚洲研究院系统组实习生-深度学习工具与系统方向 | 算法实习
54 【实习】【北京-券商行研】机械行业研究助理招聘 | 券商投行基金
55 【实习】【微博平台】Java研发(可内推) | 研发开发测试
56 【北京实习】TokenInsight公司招聘金融翻译实习生(懂区块链) | 运营实习
57 【北京 实习】TokenInsight招区块链行业研究实习生 | 投资咨询实习生
58 【光大证券(北京)研究所招聘】 | 券商投行基金
59 【实习】【百度】数据仓库研发实习生 | 研发开发测试

The full classification training process and the original ipynb source are available on GitHub: https://github.com/YunmaoLeo/Crawler_Cluster_Classify