快速问医生 (ask120.com): online data scraping and adoption prediction

Data scraping

# Import the required packages
import requests
from bs4 import BeautifulSoup
from lxml import etree
import time
import csv
# Output CSV, opened in append mode (内分泌 = endocrinology)
fp = open('内分泌.csv','a',newline='',encoding='utf_8_sig')
writer = csv.writer(fp)

headers={
  'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:70.0) Gecko/20100101 Firefox/70.0'
}

# Endocrinology department online consultation list page
url = 'https://www.120ask.com/list/nfmk/'
# Scrape list pages 2 to 101, 100 pages in total
for i in range(2,102):
    r = url+str(i)+'/'
    html = requests.get(r,headers=headers)
    html.raise_for_status()
    html.encoding = html.apparent_encoding     
    soup = BeautifulSoup(html.text,'html.parser')
    # Follow every first-level link to open the detail page of each question
    for item in soup.find_all('p','h-pp1'):
        link = 'https:' + item.find('a','q-quename')['href']
        date_html = requests.get(link,headers=headers).text
        f = etree.HTML(date_html)
        # Gender and age of the asker
        ques_gender_age = f.xpath('/html/body/div[1]/div[5]/div[2]/div[3]/div[1]/div/span[1]/text()')[0]
        # Question description
        ques_des = f.xpath('/html/body/div[1]/div[5]/div[2]/div[3]/div[2]/p[1]/text()')
        ques_des = [''.join(x.split()) for x in ques_des]
        while ques_des.count(''):
            ques_des.remove('')
        ques_des = ques_des[0]
        # Reply time
        ans_time = f.xpath('normalize-space(/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[2]/div[2]/span/text())')
        # Title of the responding doctor
        anser_position = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[1]/text()')
        anser_position = [''.join(x.split()) for x in anser_position]
        while anser_position.count(''):
            anser_position.remove('')
        # Specialties of the responding doctor
        anser_good_at = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[2]/text()')
        # Reply content
        anser_content = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[2]/div[2]/div[1]/div[1]/p/text()')
        anser_content = [''.join(x.split()) for x in anser_content]
        # Time the question was posted
        release_time = f.xpath('//*[@id="body_main"]/div[5]/div[2]/div[3]/div[1]/div/span[2]/text()')[0]
        # Consultation experience of the responding doctor
        anser_help_amout = f.xpath('//*[@id="body_main"]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[3]/text()')
        anser_help_amout = [''.join(x.split()) for x in anser_help_amout]
        while anser_help_amout.count(''):
            anser_help_amout.remove('')

        writer.writerow((release_time,ques_gender_age,ques_des,ans_time,anser_position,anser_good_at,anser_help_amout,anser_content))

# Second pass over the same list pages: question title and department
for i in range(2,102):
    r = url+str(i)+'/'
    html = requests.get(r,headers=headers)
    html.raise_for_status()
    html.encoding = html.apparent_encoding     
    soup = BeautifulSoup(html.text,'html.parser')
    for item in soup.find_all('p','h-pp1'):
        # Question title
        ques_title = item.find('a','q-quename')['title']
        # Department the question belongs to
        department = item.find('a').get_text()
        writer.writerow((department,ques_title))

# Third pass over the same list pages: number of replies and reply status
for i in range(2,102):
    r = url+str(i)+'/'
    html = requests.get(r,headers=headers)
    html.raise_for_status()
    html.encoding = html.apparent_encoding     
    soup = BeautifulSoup(html.text,'html.parser')
    
    for item in soup.find_all('div','fr h-right-p'):
        ans_amout1 = item.find_next()
        # Reply status (the target variable)
        ans_status = ans_amout1.find_next().get_text()
        # Number of replies
        ans_amout = ans_amout1.get_text()
        writer.writerow((ans_amout,ans_status))
fp.close()
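The `time` module imported at the top is never used. When fetching a hundred list pages plus every detail page, it is worth pausing between requests and retrying failed ones; the helper below is a minimal sketch of that idea (the name `get_with_retry` and the retry/delay values are my own, not part of the original script).

import time
import requests

def get_with_retry(url, headers, retries=3, delay=1.0):
    """Fetch a URL politely: time out, retry on failure, back off between attempts."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))   # wait longer after each failure
    return None

# Usage in the loops above:
#   html = get_with_retry(r, headers)
#   if html is None:
#       continue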

After the scraping above is finished, the three passes are consolidated in Excel.
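The same consolidation could also be scripted instead of done by hand; the sketch below assumes the three passes had each been written to their own CSV (the file names are hypothetical) and that the listing did not change between passes, so the rows line up by position.

import pandas as pd

# Hypothetical per-pass files; the original script appends everything to 内分泌.csv.
details = pd.read_csv('details.csv', header=None)
titles = pd.read_csv('titles.csv', header=None)
replies = pd.read_csv('replies.csv', header=None)

# Align the three passes row by row and save the merged table.
merged = pd.concat([titles, details, replies], axis=1)
merged.to_csv('moredat1_raw.csv', index=False)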

Data processing and analysis

Data quantification

(figure: coding scheme used to quantify the raw fields)
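The actual coding scheme lives in the figure above; the snippet below only illustrates the kind of mapping involved, with made-up codes and a hypothetical input file, and is not the exact scheme used here.

import pandas as pd

raw = pd.read_csv('moredat1_raw.csv')   # hypothetical merged file

# Illustrative encodings -- the real mapping is the one shown in the figure.
raw['sex'] = raw['sex'].map({'男': 1, '女': 0})
raw['doc_position'] = raw['doc_position'].map(
    {'医师': 1, '主治医师': 2, '副主任医师': 3, '主任医师': 4})
raw['reply_content'] = raw['reply_content'].str.len()          # length of the reply text
raw['adoption'] = (raw['ans_status'] == '已采纳').astype(int)   # target: reply adopted or not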

Data import

import pandas as pd
import xlrd
features = pd.read_excel('moredat1.xlsx')
features.head(5)


Data size

print('The shape of our features is:', features.shape)

The shape of our features is: (2414, 12)

# Descriptive statistics for each column
features.describe()


Data preprocessing

Label extraction and format conversion

import numpy as np
# Target variable: whether the reply was adopted
labels = np.array(features['adoption'])
# Remove the target column from the feature table
features = features.drop('adoption', axis = 1)
# Keep the column names for tree export and feature importances later
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)
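Before splitting, it is worth checking how balanced the adoption label is, since plain accuracy can look good on a skewed target; if the classes turn out to be unbalanced, `train_test_split` below also accepts `stratify=labels` to keep the same ratio in both sets. A quick check:

import numpy as np

# Count how many samples fall into each class of the target variable.
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))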

Training and test set split

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.4,random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)#1448
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)#966

Training Features Shape: (1448, 11)
Training Labels Shape: (1448,)
Testing Features Shape: (966, 11)
Testing Labels Shape: (966,)

Building a baseline random forest model


from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 10, random_state=44)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions,test_labels)]
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0}'.format(accuracy))

accuracy = 0.860248447204969
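Accuracy alone hides where the model goes wrong; scikit-learn's built-in metrics give the confusion matrix and per-class precision/recall for the same predictions. A short sketch:

from sklearn.metrics import confusion_matrix, classification_report

# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(test_labels, predictions))
print(classification_report(test_labels, predictions))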

Visualizing a tree


from sklearn.tree import export_graphviz
import pydot 
tree = rf.estimators_[0]
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png'); 

(figure: tree.png, the full first tree)
The tree is too large, so let's limit its depth.

rf_small = RandomForestClassifier(n_estimators=10, max_depth = 3, random_state=42)
rf_small.fit(train_features, train_labels)
tree_small = rf_small.estimators_[0]
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

(figure: small_tree.png, a depth-3 tree)
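If graphviz and pydot are not installed, the same small tree can be drawn directly with `sklearn.tree.plot_tree` (available since scikit-learn 0.21); a minimal sketch:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw the depth-3 tree without going through the .dot file.
plt.figure(figsize=(12, 6))
plot_tree(tree_small, feature_names=feature_list, filled=True, precision=1)
plt.savefig('small_tree_plt.png')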

Feature importance

importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: reply_speed Importance: 0.18
Variable: doc_help_amount Importance: 0.16
Variable: reply_content Importance: 0.16
Variable: age Importance: 0.15
Variable: reply_amount Importance: 0.12
Variable: department Importance: 0.08
Variable: hospital_level Importance: 0.05
Variable: doc_position Importance: 0.04
Variable: sex Importance: 0.03
Variable: doc_liked Importance: 0.03
Variable: doc_good_at Importance: 0.02

import matplotlib.pyplot as plt

x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title(' Random Forest Variable Importances'); 


Building a baseline GBDT model

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(min_samples_split=50, n_estimators=10, learning_rate=0.1, random_state=0)
gbdt.fit(train_features, train_labels)
predictions = gbdt.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions,test_labels)]
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0}'.format(accuracy))

accuracy = 0.8643892339544513
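This GBDT uses only 10 trees with default settings; its parameters could be tuned the same way the logistic model is tuned below. A sketch of a small grid search (the parameter ranges are illustrative, not the ones used in this post):

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gbdt_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    param_grid={
        'n_estimators': [10, 50, 100],
        'learning_rate': [0.05, 0.1],
        'max_depth': [2, 3],
    },
    cv=3)
gbdt_grid.fit(train_features, train_labels)
print(gbdt_grid.best_params_, gbdt_grid.best_score_)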

Feature importance

importances = list(gbdt.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: reply_amount Importance: 0.58
Variable: department Importance: 0.18
Variable: age Importance: 0.13
Variable: reply_speed Importance: 0.07
Variable: doc_help_amount Importance: 0.03
Variable: sex Importance: 0.0
Variable: hospital_level Importance: 0.0
Variable: doc_position Importance: 0.0
Variable: doc_good_at Importance: 0.0
Variable: doc_liked Importance: 0.0
Variable: reply_content Importance: 0.0

x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('GBDT Variable Importances');


Logistic regression model

Finding the optimal parameters

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# First, search for the best hyper-parameters
parameters = {
    "penalty": ['l1', 'l2'],
    "C": [0.01, 0.1, 1],
    "fit_intercept": [True, False],
    "max_iter": [100, 150, 200]
}
# liblinear is used because it supports both the l1 and l2 penalties in the grid
clf = GridSearchCV(LogisticRegression(solver='liblinear', random_state=0), param_grid=parameters, cv=3)
clf.fit(train_features, train_labels)
# Show the best parameters found
print("Best parameters:", clf.best_params_)

Best parameters: {'penalty': 'l2', 'max_iter': 100, 'C': 0.1, 'fit_intercept': False}
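Instead of copying the best parameters into a new model by hand as below, the search object already holds a refitted model (GridSearchCV refits the best combination on the full training set by default):

best_lr = clf.best_estimator_      # the tuned model, already fitted
print(clf.best_score_)             # mean cross-validated accuracy of that model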

lr = LogisticRegression(C = 0.1, fit_intercept=False, max_iter=100, penalty='l2', random_state=0)
lr.fit(train_features, train_labels)

predictions = lr.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions,test_labels)]
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0}'.format(accuracy))

accuracy = 0.8633540372670807
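The three models end up within roughly half a percentage point of each other on this single 60/40 split; a cross-validated comparison over the whole dataset gives a more stable picture. A sketch with 5-fold cross-validation:

from sklearn.model_selection import cross_val_score

for name, model in [('Random Forest', rf), ('GBDT', gbdt), ('Logistic', lr)]:
    scores = cross_val_score(model, features, labels, cv=5)
    print(name, round(scores.mean(), 4))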

Analysis summary

  1. Correlation between age and the other factors

    (figure: correlation of age with the other features)

  2. Feature differences across departments

    (figure: feature comparison across departments)

  3. Strategy recommendations
    For the Ask120 platform: to increase the number of adopted replies, and thereby gain higher profit and greater social impact, the platform could be optimized through the following measures:
  1. Exploit the delay in patients' responses: use the prediction model to recommend hospitals and drugs accurately and convert online traffic.
  2. Patients generally trust doctors with rich online consultation experience; the platform can distribute questions more evenly to raise the average consultation volume of its doctors.
  3. Patients of different ages behave differently: the older the patient, the lower the adoption rate, and older patients rely more on the number of replies when weighing a treatment plan.
  4. Different departments should focus on improving their own key indicators.