前篇已经获取行家主页的URL。这次工作主要是获取主页上的数据。
看看主页上有啥
选一个行家主页进去看看有什么数据。
可能需要的东西都用红框标出。除了最上面地址栏中的行家编号,下面用红框标出的内容分别有所在城市、职业简述(文本数据可以考虑做个文本分类)、评分、响应率、约聊人数和行家的照片(照片信息可以获取其性别、年龄区间等,用CNN训练个分类器即可,下次说)。
网页继续往下拉。
到话题部分了,框住的有话题内容,话题价格(有点高。。。)等。
接着往下拉,看下还有什么。
评论信息部分,有评论数量、评论内容、行家回复内容。
获取数据
除了评论部分的数据,其余都可以直接在开发者工具的 Elements 面板中找到,不再详细说明,代码如下。
def get_data(ite_):
    """Scrape a mentor's profile page: avatar, basic info and topic list.

    Parameters
    ----------
    ite_ : str
        Mentor id; used both to build the profile URL and as the avatar
        file name (``<ite_>.png`` is written to the working directory).

    Returns
    -------
    tuple[list, list]
        ``(basic_info, topics)``. ``basic_info`` holds name, city, job
        title, score, response rate and meeting count, with whitespace,
        ``+`` and the yen sign stripped. ``topics`` starts with the topic
        count, followed by alternating topic text / cleaned price strings.
    """
    # Build the profile URL from the mentor id.
    page_url = 'https://www.zaih.com/falcon/mentors/' + ite_
    pattern_1 = re.compile(r'[\s+¥]')      # strips whitespace, '+' and the currency sign
    pattern_2 = re.compile(r'[(](.*)[)]')  # extracts the URL inside style="...url(...)"
    page = urlopen(page_url)
    bs0bj = BeautifulSoup(page, 'lxml')
    # Download the avatar: the image URL is embedded in the div's inline
    # style attribute, so pull it out with the regex.
    # (Was previously stored back into `url`, shadowing the page URL.)
    avatar_div = bs0bj.find('div', {'class': 'mentor-info__avatar'})
    image_style = avatar_div['style']
    image_urls = pattern_2.findall(image_style)
    image_download_path = ite_ + '.png'
    urlretrieve(image_urls[0], image_download_path)
    # Basic info: name, city, job title.
    hj_basic = []
    hj_basic.append(bs0bj.find('div', {'class': 'mentor-info__name'}).get_text())
    hj_basic.append(bs0bj.find('div', {'class': 'mentor-info__city'}).get_text())
    hj_basic.append(bs0bj.find('div', {'class': 'mentor-info__title'}).get_text())
    # The three <span class="content"> elements are, in page order:
    # score, response rate, number of meetings.
    content_spans = bs0bj.findAll('span', {'class': 'content'})
    hj_basic.extend(span.get_text() for span in content_spans[:3])
    item_change = [pattern_1.sub('', item) for item in hj_basic]
    # Topic section: count first, then alternating text / price entries.
    topics = bs0bj.findAll('li', {'class': 'topic-item'})
    hj_topic = [len(topics)]
    for topic in topics:
        word = topic.find('div', {'class': 'topic-item__content'}).find('p').get_text()
        hj_topic.append(word)
        price = topic.find('span', {'class': 'price'}).get_text()
        hj_topic.append(pattern_1.sub('', price))
    return item_change, hj_topic
评论部分的问题是爬取评论类数据时最常遇到的问题:页面动态加载更多评论。在行的行家主页每次会显示十条评论(图中用户评论数比上面多了一条,因为这张图比之前那张晚截了几天)。
F12
打开开发者工具,观察 Network 面板下 XHR 请求的传输内容,如下:
如图,可以看到响应数据包括十条评论信息,我们需要comments和replies部分的内容。观察请求头的信息,如下:
Request Method为GET,Request URL也仅仅需要改变页数page的参数,即可获得相应评论页的评论信息。此外,Response Headers中有评论总数x-total-count,可用于计算总页数。如下图:
代码如下:
def get_comment(ite_):
    """Fetch every comment (and the first reply, if any) for a mentor.

    Pages through the site's XHR comment API 10 items at a time.

    Parameters
    ----------
    ite_ : str
        Mentor id used to build the API URL.

    Returns
    -------
    list
        ``[total_count, comment_1, reply_1, comment_2, reply_2, ...]``
        where a missing reply is recorded as the placeholder ``'无'``.
    """
    # BUG FIX: pattern_1 was only defined inside get_data's scope, so
    # referencing it here raised NameError. Compile it locally.
    pattern_1 = re.compile(r'[\s+¥]')
    comment = []
    url = 'https://www.zaih.com/falcon/meet_api/v1/mentors/' + ite_ + '/comments?page=1&per_page=10'
    # Total comment count is exposed in the *response* headers
    # (the x-total-count field read via urlopen(...).info()).
    total = int(urlopen(url).info()['x-total-count'])
    comment.append(total)
    # The API returns 10 comments per page; compute how many pages to fetch.
    xhr_pages = math.ceil(total / 10)
    comment_url_p1 = 'https://www.zaih.com/falcon/meet_api/v1/mentors/' + ite_ + '/comments?page='
    comment_url_p2 = '&per_page=10'
    for page_no in range(1, xhr_pages + 1):
        comment_url = comment_url_p1 + str(page_no) + comment_url_p2
        entries = requests.get(comment_url).json()
        for entry in entries:
            # Comment text, then the first reply (or a placeholder).
            comment.append(pattern_1.sub('', entry['comments']))
            replies = entry['replies']
            if replies:
                comment.append(pattern_1.sub('', replies[0]['content']))
            else:
                comment.append('无')
    return comment
代码汇总
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
from urllib.request import urlretrieve
import math
def get_data(ite_):
    """Scrape a mentor's profile page: avatar, basic info and topic list.

    Parameters
    ----------
    ite_ : str
        Mentor id; used both to build the profile URL and as the avatar
        file name (``<ite_>.png`` is written to the working directory).

    Returns
    -------
    tuple[list, list]
        ``(basic_info, topics)``. ``basic_info`` holds name, city, job
        title, score, response rate and meeting count, with whitespace,
        ``+`` and the yen sign stripped. ``topics`` starts with the topic
        count, followed by alternating topic text / cleaned price strings.
    """
    # Build the profile URL from the mentor id.
    page_url = 'https://www.zaih.com/falcon/mentors/' + ite_
    pattern_1 = re.compile(r'[\s+¥]')      # strips whitespace, '+' and the currency sign
    pattern_2 = re.compile(r'[(](.*)[)]')  # extracts the URL inside style="...url(...)"
    page = urlopen(page_url)
    bs0bj = BeautifulSoup(page, 'lxml')
    # Download the avatar: the image URL is embedded in the div's inline
    # style attribute, so pull it out with the regex.
    # (Was previously stored back into `url`, shadowing the page URL.)
    avatar_div = bs0bj.find('div', {'class': 'mentor-info__avatar'})
    image_style = avatar_div['style']
    image_urls = pattern_2.findall(image_style)
    image_download_path = ite_ + '.png'
    urlretrieve(image_urls[0], image_download_path)
    # Basic info: name, city, job title.
    hj_basic = []
    hj_basic.append(bs0bj.find('div', {'class': 'mentor-info__name'}).get_text())
    hj_basic.append(bs0bj.find('div', {'class': 'mentor-info__city'}).get_text())
    hj_basic.append(bs0bj.find('div', {'class': 'mentor-info__title'}).get_text())
    # The three <span class="content"> elements are, in page order:
    # score, response rate, number of meetings.
    content_spans = bs0bj.findAll('span', {'class': 'content'})
    hj_basic.extend(span.get_text() for span in content_spans[:3])
    item_change = [pattern_1.sub('', item) for item in hj_basic]
    # Topic section: count first, then alternating text / price entries.
    topics = bs0bj.findAll('li', {'class': 'topic-item'})
    hj_topic = [len(topics)]
    for topic in topics:
        word = topic.find('div', {'class': 'topic-item__content'}).find('p').get_text()
        hj_topic.append(word)
        price = topic.find('span', {'class': 'price'}).get_text()
        hj_topic.append(pattern_1.sub('', price))
    return item_change, hj_topic
def get_comment(ite_):
    """Fetch every comment (and the first reply, if any) for a mentor.

    Pages through the site's XHR comment API 10 items at a time.

    Parameters
    ----------
    ite_ : str
        Mentor id used to build the API URL.

    Returns
    -------
    list
        ``[total_count, comment_1, reply_1, comment_2, reply_2, ...]``
        where a missing reply is recorded as the placeholder ``'无'``.
    """
    # BUG FIX: pattern_1 was only defined inside get_data's scope, so
    # referencing it here raised NameError. Compile it locally.
    pattern_1 = re.compile(r'[\s+¥]')
    comment = []
    url = 'https://www.zaih.com/falcon/meet_api/v1/mentors/' + ite_ + '/comments?page=1&per_page=10'
    # Total comment count is exposed in the *response* headers
    # (the x-total-count field read via urlopen(...).info()).
    total = int(urlopen(url).info()['x-total-count'])
    comment.append(total)
    # The API returns 10 comments per page; compute how many pages to fetch.
    xhr_pages = math.ceil(total / 10)
    comment_url_p1 = 'https://www.zaih.com/falcon/meet_api/v1/mentors/' + ite_ + '/comments?page='
    comment_url_p2 = '&per_page=10'
    for page_no in range(1, xhr_pages + 1):
        comment_url = comment_url_p1 + str(page_no) + comment_url_p2
        entries = requests.get(comment_url).json()
        for entry in entries:
            # Comment text, then the first reply (or a placeholder).
            comment.append(pattern_1.sub('', entry['comments']))
            replies = entry['replies']
            if replies:
                comment.append(pattern_1.sub('', replies[0]['content']))
            else:
                comment.append('无')
    return comment
if __name__ == '__main__':
    # Run the full scrape for a single sample mentor id.
    mentor_id = '2bgdjpas3dh'
    basic_info, topic_info = get_data(mentor_id)
    comment_info = get_comment(mentor_id)