Python Multi-threaded Scraper for the Shuimu Community (newsmth) Intern Board
Library import
from bs4 import BeautifulSoup
import pandas as pd
import requests
import logging
import numpy as np
import re
import tqdm
from threading import Thread
import queue
import time
Target Links
start = 323000
end = 327000
link_list = []
for i in range(start, end):
    link_list.append('http://www.newsmth.net/nForum/article/Intern/' + str(i) + '?ajax')
Spider class extending Thread
class ShuimuSpider(Thread):
    def __init__(self, url, q):
        super(ShuimuSpider, self).__init__()
        self.url = url
        self.q = q

    def run(self):
        self.parse_page()

    def send_request(self, url):
        try:
            html = requests.get(url, timeout=2)
        except Exception:
            print("fail at %s" % url)
        else:
            return html

    def parse_page(self):
        response = self.send_request(self.url)
        # send_request returns None on failure, so guard before touching status_code
        if response is not None and response.status_code == 200:
            try:
                bs = BeautifulSoup(response.content, 'html.parser')
                store_data = {
                    'time': [],
                    'title': [],
                    'detail': []
                }
                # .p is the post's <p> tag; inspecting the board shows that almost
                # every job posting's body sits inside this tag, and .text returns
                # its plain text.
                text = bs.p.text
                # Locate the title, time and body by searching for marker strings,
                # then slice with text[x:y].
                # position of the title:
                Title_Position1 = text.find('标 题: ') + len('标 题: ')
                Title_Position2 = text.find(' 发信站: 水木社区')
                Title = text[Title_Position1:Title_Position2]
                # position of the time:
                Time_Position1 = text.find(' 发信站: 水木社区') + len(' 发信站: 水木社区')
                Time_Position2 = text.find('), 站内') + 1
                Time = text[Time_Position1:Time_Position2]
                # position of the body:
                Detail_Position1 = text.find('), 站内') + len('), 站内') + 4
                Detail_Position2 = text.find('※ 来源:·水木社区')
                Detail = text[Detail_Position1:Detail_Position2]
                # fill the dict and, on success, push the result onto the shared queue
                store_data['time'].append(Time)
                store_data['title'].append(Title)
                store_data['detail'].append(Detail)
                self.q.put(store_data)
            except Exception:
                return
        else:
            return
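As a quick sanity check of the marker-and-slice logic in parse_page, here is a minimal sketch run on a fabricated post header (the header text below is an assumption that only approximates the real newsmth layout):

# a fabricated post header; only approximates the real board layout
sample = ('发信人: someone (nick), 信区: Intern\n'
          '标 题: 【实习】某公司数据分析实习生\n'
          ' 发信站: 水木社区 (Mon Jul 12 10:23:45 2021), 站内\n\n'
          '    岗位职责:数据清洗与分析\n'
          '※ 来源:·水木社区')
t1 = sample.find('标 题: ') + len('标 题: ')
t2 = sample.find(' 发信站: 水木社区')
print(repr(sample[t1:t2]))   # the title (with a trailing newline)
p1 = sample.find(' 发信站: 水木社区') + len(' 发信站: 水木社区')
p2 = sample.find('), 站内') + 1
print(repr(sample[p1:p2]))   # ' (Mon Jul 12 10:23:45 2021)'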
Multi-threaded scraping function
def Fast_Scrapy(x, y, store_data):
    t_start = time.time()
    q = queue.Queue(0)
    Thread_list = []
    # one thread per link in the requested slice of link_list
    for url in link_list[x:y]:
        p = ShuimuSpider(url, q)
        p.start()
        Thread_list.append(p)
    for i in Thread_list:
        i.join()
    # drain the queue into the shared store_data dict
    while not q.empty():
        data = q.get()
        store_data['time'].append(data['time'])
        store_data['title'].append(data['title'])
        store_data['detail'].append(data['detail'])
    t_end = time.time()
    print('Total time for the Thread + Queue multi-threaded crawler:', t_end - t_start)
Run the scraper
def Scrapy():
    store_data = {
        'time': [],
        'title': [],
        'detail': []
    }
    # link_list is indexed 0 .. end-start, so the bounds passed to Fast_Scrapy
    # are offsets into link_list, not raw article ids
    if end - start > 150:
        # scrape in chunks of 150 links; the slice inside Fast_Scrapy safely
        # truncates the last chunk if it runs past the end of link_list
        period = np.arange(0, end - start, 150)
        for i in tqdm.tqdm(period):
            Fast_Scrapy(i, i + 150, store_data)
    else:
        Fast_Scrapy(0, end - start, store_data)
    df = pd.DataFrame(store_data)
    df.to_csv('../dataset/' + str(start) + '-' + str(end) + '.csv')
    return df
%%time
df = Scrapy()
Data Cleaning
Remove '[]'
# each cell holds a one-element list; unwrap it to a plain string
for col in ['time', 'title', 'detail']:
    df[col] = df[col].apply(lambda x: x[0])
Convert to pd.datetime
# the raw time field is assumed to look like ' (Mon Jul 12 10:23:45 2021)';
# keep the date part and the year, drop the clock time, then parse it
df['time'] = (df['time'].str[2:12] + df['time'].str[21:-1])
df['time'] = pd.to_datetime(df['time'], errors='coerce')
df['time'] = df['time'].ffill()
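To see what the slicing above does, here is a quick check on a sample raw value; the exact timestamp format is an assumption based on the post header layout:

# hypothetical raw value produced by parse_page
sample = ' (Mon Jul 12 10:23:45 2021)'
date_part = sample[2:12] + sample[21:-1]           # 'Mon Jul 12 2021'
print(pd.to_datetime(date_part, errors='coerce'))  # 2021-07-12 00:00:00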
Remove rows with very short titles and duplicates
df = df[df['title'].str.len()>5]
df = df[~df['title'].duplicated()]
df.reset_index(drop=True,inplace=True)
Remove rows without a contact email
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.IGNORECASE)
df['Email'] = np.nan
for i in range(len(df)):
    matches = regex.findall(str(df['detail'][i]))
    if matches:
        # join multiple addresses into one comma-separated string
        df.at[i, 'Email'] = ','.join(matches)
# drop posts where no email address was found
df.dropna(axis=0, how='any', inplace=True)
df.reset_index(drop=True, inplace=True)
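A quick sanity check of the email pattern on a made-up snippet:

# hypothetical posting text
sample_detail = 'Please send your resume to hr@example.com or intern.hiring@mail.example.cn'
print(regex.findall(sample_detail))
# ['hr@example.com', 'intern.hiring@mail.example.cn']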
Output as text file
def output(df):
    text = ''
    for i in range(len(df)):
        text += str(df.index[i]) + ' '
        text += df['title'][i] + '\n'
        text += df['detail'][i] + '\n'
        text += df['Email'][i] + '\n\n\n'
    with open(str(start) + '-' + str(end) + '.txt', 'w', encoding='utf-8') as f:
        f.write(text)
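A usage sketch for the exporter, assuming df is the cleaned DataFrame from the steps above:

output(df)   # writes a file such as 323000-327000.txt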
Classify
Load the classification model
(The model training process is on GitHub: https://github.com/YunmaoLeo/Crawler_Cluster_Classify)
import joblib
clf_model=joblib.load('../models/Classification_model_5_n_estimator=100_pca=400.joblib')
Predict function
mark = ['人力资源', '其他', '券商投行基金', '投资咨询实习生', '数据分析挖掘', '研发开发测试', '算法实习',
'运营实习']
import jieba_fast
stopwords = pd.read_csv('../stopwords/baidu_stopwords.txt',
quoting=3,sep='\t',names=['stopword'],encoding='utf-8')
stopwords = stopwords['stopword'].values
def pre_input(input):
    sentences = []
    # segment the text with jieba_fast
    segs = jieba_fast.cut(input, cut_all=False)
    segs = list(filter(lambda x: x.strip(), segs))
    # drop stopwords and non-breaking spaces
    segs = list(filter(lambda x: (x not in stopwords) and x != '\xa0', segs))
    sentences.append(str([x for x in segs]))
    return sentences
def predict_df(x):
    return mark[clf_model.predict(pre_input(x))[0]]
The 'stopwords' filter here can be removed, or replaced with your own stopword list.
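A minimal usage sketch, assuming the model and stopword files above loaded successfully (the sample posting text is made up):

sample_post = '【实习】某互联网公司招聘数据分析实习生,熟悉SQL和Python优先'
print(predict_df(sample_post))   # might print e.g. '数据分析挖掘', depending on the trained model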
Predict for the DataFrame
df['category']=np.zeros(len(df))
df['category']=df['detail'].apply(lambda x:predict_df(x))
df[50:60][['title','category']]
| | title | category |
|---|---|---|
| 50 | 【实习】【搜狐】市场部项目实习生-北京 | 其他 |
| 51 | 【实习】【银河证券行研】房地产组行业研究实习生招聘 | 券商投行基金 |
| 52 | 【实习】国信证券-私募直投子公司-国信弘盛-行业研究部-北京 | 券商投行基金 |
| 53 | 【MSRA】微软亚洲研究院系统组实习生-深度学习工具与系统方向 | 算法实习 |
| 54 | 【实习】【北京-券商行研】机械行业研究助理招聘 | 券商投行基金 |
| 55 | 【实习】【微博平台】Java研发(可内推) | 研发开发 |
| 56 | 【北京实习】TokenInsight公司招聘金融翻译实习生(懂区块链) | 运营实习 |
| 57 | 【北京 实习】TokenInsight招区块链行业研究实习生 | 投资咨询实习生 |
| 58 | 【光大证券(北京)研究所招聘】 | 券商投行基金 |
| 59 | 【实习】【百度】数据仓库研发实习生 | 研发开发 |
The complete classification training process and the original ipynb source are on GitHub: https://github.com/YunmaoLeo/Crawler_Cluster_Classify