声明:转载请在本文评论中标明转载发布地址
本人博客欢迎查看点击打开链接
大版本更新预告:
- 增加评论迭代更新,不再错过任何一篇新闻的任何一个评论
2018年5月18日21:10:55更新:
新增功能:
- 能在指定页面id范围内批量爬取
- 预期功能已实现
下版本增加功能:
- 启用代理ip爬取,降低被检查的风险
- 增加请求头
已知bug:
- 评论可能没有1楼,导致可能出错
版本2.0
import requests
from bs4 import BeautifulSoup
import leancloud
# import logging
# logging.basicConfig(level=logging.DEBUG)
# Enter your own LeanCloud credentials here; if you do not need cloud
# upload, disable the upload function instead.
leancloud.init("2C2xis80wMsTyMrkyi1cQIxG-gzGzoHsz", "sYMaOVyBxA81KTXMXQYwDDIg")
url = 'http://dyn.ithome.com/ithome/getajaxdata.aspx' # ithome.com AJAX endpoint that serves comment data
news_id =str(0)  # current article id; reassigned by the driver loop at the bottom of this version
urlpage = 'http://dyn.ithome.com/comment/' + news_id  # comment-page URL for the current article
def get_news_hash(news_id):  # fetch the article's anti-forgery hash
    """Return the comment-request hash for the given news id.

    The hash is not computed by JavaScript; it sits in a hidden <input>
    near the end of the comment page's HTML source.
    """
    # Build the comment-page URL from the argument instead of the
    # module-level `urlpage` global (the original ignored `news_id`,
    # silently coupling this function to the driver loop's globals).
    url_gethash = 'http://dyn.ithome.com/comment/' + str(news_id)
    html_home = requests.get(url=url_gethash)
    # Parse the page and pull the hidden hash field out of it.
    tree = BeautifulSoup(html_home.text, 'html.parser')
    news_hash = tree.find('input', attrs={'id': 'hash'})['value']
    return news_hash
def getpage_commentinfo(page_url):  # crawl every comment of one article; return a list of user dicts
    """Crawl all comment pages for the current global `news_id`.

    Returns a list of dicts, one per comment, with keys: user_id,
    user_level, user_name, user_comment, user_comment_praise,
    user_comment_oppose, user_dev, user_floor, user_address, user_time,
    user_app, user_news_id.

    NOTE(review): `page_url` is unused — the function reads the module
    globals `news_id` and `url`; the parameter is kept so existing
    callers keep working.
    """
    all_comment = []
    # The hash is constant for one article: fetch it once instead of
    # issuing an extra HTTP GET for every comment page (the original
    # called get_news_hash() inside the page loop).
    news_hash = get_news_hash(news_id)
    for page_no in range(1, 6666):  # walk comment pages until exhausted
        data_page = {  # POST payload for the comment AJAX endpoint
            'newsID': news_id,
            'hash': news_hash,
            'type': 'commentpage',
            'page': str(page_no),
            'order': 'false'
        }
        # Fetch one page and extract every <li class="entry"> block
        # (each one is a single user's comment).
        page = requests.post(url=url, data=data_page).text
        html = BeautifulSoup(page, 'html.parser')
        user_allmember = html.find_all('li', attrs={'class': 'entry'})
        if not user_allmember:  # empty page -> past the last comment page
            break
        hit_first_floor = False
        for user_infor in user_allmember:
            # id / level / name / comment / praise / oppose / device /
            # floor / address / time / app version / news id
            user_allinfo = {}
            user_allinfo['user_id'] = user_infor.div.a['title'].replace('软媒通行证数字ID:', '')
            user_allinfo['user_level'] = user_infor.div.find('div', attrs={'class': 'level'}).span.string.replace('Lv.', '')
            user_allinfo['user_name'] = str(user_infor.find('span', attrs={'class': 'nick'}).string)
            try:
                user_allinfo['user_comment'] = user_infor.find('div', attrs={'class': 'comm'}).p.get_text()
            except AttributeError:  # comment body missing
                user_allinfo['user_comment'] = 'None'
            user_allinfo['user_comment_praise'] = user_infor.find('a', attrs={'class': 's'}).string.replace('支持(', '').replace(')', '')
            user_allinfo['user_comment_oppose'] = user_infor.find('a', attrs={'class': 'a'}).string.replace('反对(', '').replace(')', '')
            try:
                user_allinfo['user_dev'] = user_infor.find('a', attrs={'href': '//m.ithome.com/ithome/download/'}).string
            except AttributeError:  # no device link for this user
                user_allinfo['user_dev'] = 'None'
            user_allinfo['user_floor'] = user_infor.find('strong', attrs={'class': 'p_floor'}).string.replace('楼', '')
            try:
                temp = user_infor.find('div', attrs={'class': 'nmp'}).find('span', attrs={'class': 'posandtime'})\
                    .string.replace('\xa0', ' ').replace('IT之家', '').replace('网友', '')
                temp = temp.split(' ')
                user_allinfo['user_address'] = temp[0]
                user_allinfo['user_time'] = temp[1] + ' ' + temp[2]
            except (AttributeError, IndexError):
                user_allinfo['user_address'] = 'None'
                # Bugfix: the original left 'user_time' unset here, so the
                # writer later raised KeyError and dropped the whole row.
                user_allinfo['user_time'] = 'None'
            try:
                user_allinfo['user_app'] = user_infor.find('span', attrs={'class': 'mobile android'}).a['title'].replace('App版本:v', '')
            except AttributeError:  # not posted from the Android app
                user_allinfo['user_app'] = 'None'
            user_allinfo['user_news_id'] = news_id
            all_comment.append(user_allinfo)
            print('已抓取' + str(user_allinfo['user_floor']) + '楼')
            # Floor numbers descend, so seeing floor 1 means the oldest
            # comment has been reached. String compare avoids the
            # ValueError the original's int() raised on odd floor labels
            # (the "no floor 1" bug listed in the header notes).
            if user_allinfo['user_floor'] == '1':
                hit_first_floor = True
        if hit_first_floor:
            break
    return all_comment
def run_write_page_comment(page_commentinfo):  # append one article's comments to the dump file
    """Append every comment dict in `page_commentinfo` to 评论信息.txt.

    Fields are joined with '☆' (chosen over '#' because '#' can occur
    inside a comment body and corrupt the column layout). A row whose
    dict is missing a key is reported and skipped instead of the bare
    `except` the original used to swallow every error.
    """
    field_order = ('user_id', 'user_level', 'user_name', 'user_comment',
                   'user_comment_praise', 'user_comment_oppose', 'user_dev',
                   'user_floor', 'user_address', 'user_time', 'user_app',
                   'user_news_id')
    with open('评论信息.txt', 'a', encoding='utf-8') as f:
        for comment in page_commentinfo:
            try:
                f.write('☆'.join(str(comment[key]) for key in field_order) + '\n')
            except KeyError:  # malformed row: report its floor and keep going
                print('抓取错误的楼层' + comment['user_floor'] + '\n')
    if page_commentinfo:  # original only reported success for a non-empty batch
        print('抓取成功')
def run_update(page_commentinfo):  # upload a list of comment dicts to LeanCloud
    """Upload every comment dict to the LeanCloud `comment` class.

    Mind the quota: the free tier allows 30k API requests per day, and
    each saved comment costs one request.
    """
    print('正在上传评论到云端')
    numeric_keys = ('user_id', 'user_level', 'user_comment_praise',
                    'user_comment_oppose', 'user_floor', 'user_news_id')
    all_keys = ('user_id', 'user_level', 'user_name', 'user_comment',
                'user_comment_praise', 'user_comment_oppose', 'user_dev',
                'user_floor', 'user_address', 'user_time', 'user_app',
                'user_news_id')
    for member in page_commentinfo:
        record_cls = leancloud.Object.extend('comment')
        record = record_cls()
        # Integer columns are converted with int(); everything else is
        # stored as-is.
        for key in all_keys:
            value = member[key]
            record.set(key, int(value) if key in numeric_keys else value)
        record.save()
    print('成功上传' + str(len(page_commentinfo)) + '条评论到云端')
# Driver: write the column header once, then batch-crawl a range of
# news ids, appending each article's comments to the dump file.
with open('评论信息.txt', 'a', encoding='utf-8') as f:
    f.write('id☆等级☆姓名☆评论☆赞同数☆反对数☆设备信息☆楼层☆地址☆时间☆app版本☆新闻id' + '\n')
for i in range(208073, 310000):  # news-id range to crawl
    try:
        news_id = str(i)
        print('正在爬新闻ID为' + news_id)
        urlpage = 'http://dyn.ithome.com/comment/' + news_id
        # run_update(getpage_commentinfo(urlpage))  # optional cloud upload
        run_write_page_comment(getpage_commentinfo(urlpage))
    except Exception as e:
        # Keep the batch crawl alive on a per-article failure, but report
        # it — the original bare `except: pass` hid every error (and even
        # swallowed Ctrl-C).
        print('新闻ID' + news_id + '抓取失败:' + repr(e))
print('任务完成')
2018年5月18日18:23:45更新:
新增功能:
- 增加上传云数据库功能
已知bug:
- 无法获取设备信息
- 导入xlsx时会导致一些用户信息异常,这是由分隔符#引起的(评论中可能有#)
以上bug已在2.0中修复
版本1.0
import requests
from bs4 import BeautifulSoup
import leancloud
# import logging
#
# logging.basicConfig(level=logging.DEBUG)
# NOTE(review): placeholder credentials — replace with real LeanCloud keys.
leancloud.init("api id", "api key")
news_id =input("请输入网站id:")  # article id entered by the user
urlpage = 'http://dyn.ithome.com/comment/' + news_id  # comment-page URL for that article
url = 'http://dyn.ithome.com/ithome/getajaxdata.aspx' # ithome.com AJAX endpoint that serves comment data
def get_news_hash(news_id):  # fetch the article's anti-forgery hash
    """Return the comment-request hash for the given news id.

    The hash is not computed by JavaScript; it sits in a hidden <input>
    near the end of the comment page's HTML source.
    """
    # Build the URL from the argument instead of the `urlpage` global
    # (the original ignored its parameter).
    url_gethash = 'http://dyn.ithome.com/comment/' + str(news_id)
    html_home = requests.get(url=url_gethash)
    # Name the parser explicitly: bare BeautifulSoup(text) guesses one,
    # emits a warning, and can parse differently per machine.
    tree = BeautifulSoup(html_home.text, 'html.parser')
    news_hash = tree.find('input', attrs={'id': 'hash'})['value']
    return news_hash
'''第一页评论不包括热评'''
# Payload for requesting the "hot comments" block (page 1 excludes them).
# NOTE(review): defined but never sent anywhere in this version.
data_hot = {
    'newsID':news_id,
    'pid':'0',
    'type':'hotcomment'
}
def getpage_commentinfo(page_url):  # crawl every comment of one article; return a list of user dicts
    """Crawl all comment pages for the current global `news_id`.

    Returns a list of dicts with keys: user_id, user_level, user_name,
    user_comment, user_comment_praise, user_comment_oppose, user_dev,
    user_floor, user_address, user_time, user_app.

    NOTE(review): `page_url` is unused — the function reads the module
    globals `news_id` and `url`; the parameter is kept so the existing
    caller keeps working.
    """
    all_comment = []
    # The hash is constant for one article: fetch it once instead of one
    # extra HTTP GET per page (the original fetched it in the loop).
    news_hash = get_news_hash(news_id)
    for page_no in range(1, 6666):  # walk comment pages until exhausted
        data_page = {  # POST payload for the comment AJAX endpoint
            'newsID': news_id,
            'hash': news_hash,
            'type': 'commentpage',
            'page': str(page_no),
            'order': 'false'
        }
        # Fetch one page and extract every <li class="entry"> block.
        page = requests.post(url=url, data=data_page).text
        html = BeautifulSoup(page, 'html.parser')
        user_allmember = html.find_all('li', attrs={'class': 'entry'})
        # Bugfix: an article with no (more) comments used to raise
        # IndexError on user_allmember[0]; treat an empty page as done.
        if not user_allmember:
            break
        hit_first_floor = False
        for user_infor in user_allmember:
            # id / level / name / comment / praise / oppose / device /
            # floor / address / time / app version
            user_allinfo = {}
            user_allinfo['user_id'] = user_infor.div.a['title'].replace('软媒通行证数字ID:', '')
            user_allinfo['user_level'] = user_infor.div.find('div', attrs={'class': 'level'}).span.string.replace('Lv.', '')
            user_allinfo['user_name'] = str(user_infor.find('span', attrs={'class': 'nick'}).string)
            try:
                user_allinfo['user_comment'] = user_infor.find('div', attrs={'class': 'comm'}).p.get_text()
            except AttributeError:  # comment body missing
                user_allinfo['user_comment'] = 'None'
            user_allinfo['user_comment_praise'] = user_infor.find('a', attrs={'class': 's'}).string.replace('支持(', '').replace(')', '')
            user_allinfo['user_comment_oppose'] = user_infor.find('a', attrs={'class': 'a'}).string.replace('反对(', '').replace(')', '')
            try:
                user_allinfo['user_dev'] = user_infor.find('a', attrs={'href': '//m.ithome.com/ithome/download/'}).string
            except AttributeError:  # no device link for this user
                user_allinfo['user_dev'] = 'None'
            user_allinfo['user_floor'] = user_infor.find('strong', attrs={'class': 'p_floor'}).string.replace('楼', '')
            try:
                temp = user_infor.find('div', attrs={'class': 'nmp'}).find('span', attrs={'class': 'posandtime'})\
                    .string.replace('\xa0', ' ').replace('IT之家', '').replace('网友', '')
                temp = temp.split(' ')
                user_allinfo['user_address'] = temp[0]
                user_allinfo['user_time'] = temp[1] + ' ' + temp[2]
            except (AttributeError, IndexError):
                user_allinfo['user_address'] = 'None'
                # Bugfix: 'user_time' used to be left unset here, making
                # downstream consumers fail with KeyError for this row.
                user_allinfo['user_time'] = 'None'
            try:
                user_allinfo['user_app'] = user_infor.find('span', attrs={'class': 'mobile android'}).a['title'].replace('App版本:v', '')
            except AttributeError:  # not posted from the Android app
                user_allinfo['user_app'] = 'None'
            all_comment.append(user_allinfo)
            print('已抓取' + str(user_allinfo['user_floor']) + '楼')
            # Floor numbers descend; floor 1 means the oldest comment has
            # been crawled. String compare avoids int()'s ValueError on
            # odd floor labels.
            if user_allinfo['user_floor'] == '1':
                hit_first_floor = True
        if hit_first_floor:
            break
    return all_comment
def run_write_page_comment(page_commentinfo):  # append header + one article's comments to the dump file
    """Append a header row plus every comment dict to 评论信息.txt.

    Fields are '#'-separated. NOTE(review): '#' can occur inside a
    comment body and then corrupts the column layout — version 2.0
    switched to '☆' for exactly this reason. A row with a missing key is
    reported and skipped instead of being swallowed by a bare `except`.
    """
    field_order = ('user_id', 'user_level', 'user_name', 'user_comment',
                   'user_comment_praise', 'user_comment_oppose', 'user_dev',
                   'user_floor', 'user_address', 'user_time', 'user_app')
    with open('评论信息.txt', 'a', encoding='utf-8') as f:
        f.write('id#等级#姓名#评论#赞同数#反对数#设备信息#楼层#地址#时间#app版本' + '\n')
        for comment in page_commentinfo:
            try:
                f.write('#'.join(str(comment[key]) for key in field_order) + '\n')
            except KeyError:  # malformed row: report its floor and keep going
                print('抓取错误的楼层' + comment['user_floor'] + '\n')
    if page_commentinfo:  # original only reported success for a non-empty batch
        print('抓取成功')
def run_update(page_commentinfo):  # upload a list of comment dicts to LeanCloud
    """Upload every comment dict to the LeanCloud `comment` class.

    Mind the quota: the free tier allows only 30k API requests per day,
    and each saved comment costs one request.
    """
    numeric_keys = ('user_id', 'user_level', 'user_comment_praise',
                    'user_comment_oppose', 'user_floor')
    all_keys = ('user_id', 'user_level', 'user_name', 'user_comment',
                'user_comment_praise', 'user_comment_oppose', 'user_dev',
                'user_floor', 'user_address', 'user_time', 'user_app')
    for member in page_commentinfo:
        record_cls = leancloud.Object.extend('comment')
        record = record_cls()
        print(member['user_dev'])
        # Integer columns go through int(); everything else is stored as-is.
        for key in all_keys:
            value = member[key]
            record.set(key, int(value) if key in numeric_keys else value)
        record.save()
# Entry point: crawl the article's comments and upload them to LeanCloud.
run_update(getpage_commentinfo(urlpage))
# run_write_page_comment(getpage_commentinfo(urlpage))
已知bug:
- 无法获取安卓系统以外的设备信息
- 新闻可能没评论,导致程序出现异常
- 获取用户信息错位
以上bug已在2.0中修复
import requests
from bs4 import BeautifulSoup
news_id =input("请输入网站id:")  # article id entered by the user
urlpage = 'http://dyn.ithome.com/comment/' + news_id  # comment-page URL for that article
url = 'http://dyn.ithome.com/ithome/getajaxdata.aspx' # ithome.com AJAX endpoint that serves comment data
def get_news_hash(news_id):  # fetch the article's anti-forgery hash
    """Return the comment-request hash for the given news id.

    The hash is not computed by JavaScript; it sits in a hidden <input>
    near the end of the comment page's HTML source.
    """
    # NOTE(review): this version reads the `urlpage` global; the URL is
    # now derived from the argument so the function stands alone.
    url_gethash = 'http://dyn.ithome.com/comment/' + str(news_id)
    html_home = requests.get(url=url_gethash)
    # Name the parser explicitly: bare BeautifulSoup(text) guesses one,
    # emits a warning, and can parse differently per machine.
    tree = BeautifulSoup(html_home.text, 'html.parser')
    news_hash = tree.find('input', attrs={'id': 'hash'})['value']
    return news_hash
'''第一页评论不包括热评'''
# Payload for requesting the "hot comments" block (page 1 excludes them).
# NOTE(review): defined but never sent anywhere in this version.
data_hot = {
    'newsID':news_id,
    'pid':'0',
    'type':'hotcomment'
}
def getpage_commentinfo(page_url):
    """Crawl every comment page of the article and return a list of
    per-comment dicts.

    NOTE(review): `page_url` is unused — the function reads the module
    globals `news_id` and `url` instead.
    NOTE(review): this earliest version carries the known bugs listed in
    the notes above (fixed in 2.0): it crashes with IndexError when an
    article has no comments, it only detects Android devices, and on a
    parse failure it stores the fallback under the wrong key
    ('user_address_and_time'), so the writer later hits a KeyError for
    'user_address'/'user_time'.
    """
    all_comment = []
    for i in range(1,6666) : # step through comment pages
        data_page = { # POST payload for the comment AJAX endpoint
            'newsID':news_id,
            'hash':get_news_hash(news_id),  # NOTE(review): one extra HTTP GET per page; the hash is constant per article
            'type':'commentpage',
            'page':str(i),
            'order':'false'
        }
        '''Read all user info out of one page.'''
        page = requests.post(url=url, data=data_page).text
        html = BeautifulSoup(page,'html.parser')
        user_allmember = html.find_all('li', attrs={'class': 'entry'}) # one <li class="entry"> per comment; each holds that user's full HTML
        '''Load each user's info into a dict, then append the dict to the list.'''
        for x in range(0,6666) :
            long = len(user_allmember) # number of comment blocks on this page
            user_infor = user_allmember[x] # pick one comment block; NOTE(review): IndexError when the page has no comments
            '''id / level / name / comment / praise / oppose / device / floor / address / time / app version'''
            user_allinfo = {}
            user_allinfo['user_id'] = user_infor.div.a['title'].replace('软媒通行证数字ID:', '')
            user_allinfo['user_level'] = user_infor.div.find('div', attrs={'class': 'level'}).span.string.replace('Lv.', '')
            user_allinfo['user_name'] = str(user_infor.find('span', attrs={'class': 'nick'}).string)
            try:
                user_allinfo['user_comment'] = user_infor.find('div',attrs={'class':'comm'}).p.get_text()
            except:
                user_allinfo['user_comment'] = 'None'
            user_allinfo['user_comment_praise'] = user_infor.find('a', attrs={'class': 's'}).string.replace('支持(', '').replace(')', '')
            user_allinfo['user_comment_oppose'] = user_infor.find('a', attrs={'class': 'a'}).string.replace('反对(', '').replace(')', '')
            try:
                # NOTE(review): matches Android clients only — every other
                # device type falls into the except branch.
                user_allinfo['user_dev'] = user_infor.find('span',attrs={'class':'mobile android'}).string
            except:
                user_allinfo['user_dev'] = 'None'
            user_allinfo['user_floor'] = user_infor.find('strong', attrs={'class': 'p_floor'}).string.replace('楼', '')
            try:
                temp = user_infor.find('div',attrs={'class':'nmp'}).find('span',attrs={'class':'posandtime'})\
                    .string.replace('\xa0', ' ').replace('IT之家', '').replace('网友', '')
                temp = temp.split(' ')
                user_allinfo['user_address'] = temp[0]
                user_allinfo['user_time'] = temp[1]+' '+temp[2]
            except:
                # NOTE(review): fallback lands under the wrong key —
                # 'user_address'/'user_time' stay unset, so the writer's
                # row for this user later fails with KeyError.
                user_allinfo['user_address_and_time'] = 'None'
            try:
                user_allinfo['user_app'] = user_infor.find('span', attrs={'class': 'mobile android'}).a['title'].replace('App版本:v', '')
            except:
                user_allinfo['user_app'] = 'None'
            all_comment.append(user_allinfo)
            # all_user =
            print(str(user_allinfo['user_floor']))
            if x == long-1: # last block on this page -> move on to the next page
                break
        # Floor numbers descend; reaching floor 1 means the oldest
        # comment has been crawled, so stop paging.
        if int(user_allinfo['user_floor']) == 1:
            break
    return all_comment
def run_write_page_comment(page_commentinfo):  # overwrite the dump file with header + comments
    """Write a header row plus every comment dict to 评论信息.txt.

    Opens the file in 'w' mode, so each call replaces the previous dump.
    Fields are '#'-separated. NOTE(review): '#' can occur inside a
    comment body and then corrupts the column layout. A row with a
    missing key is reported and skipped instead of being swallowed by a
    bare `except`.
    """
    field_order = ('user_id', 'user_level', 'user_name', 'user_comment',
                   'user_comment_praise', 'user_comment_oppose', 'user_dev',
                   'user_floor', 'user_address', 'user_time', 'user_app')
    with open('评论信息.txt', 'w', encoding='utf-8') as f:
        f.write('id#等级#姓名#评论#赞同数#反对数#设备信息#楼层#地址#时间#app版本' + '\n')
        for comment in page_commentinfo:
            try:
                f.write('#'.join(str(comment[key]) for key in field_order) + '\n')
            except KeyError:  # malformed row: report its floor and keep going
                print('抓取错误的楼层' + comment['user_floor'] + '\n')
    if page_commentinfo:  # original only reported success for a non-empty batch
        print('抓取成功')
# Entry point: crawl the article's comments and dump them to the text file.
run_write_page_comment(getpage_commentinfo(urlpage))