Scraping Hupu Comments with Python: a Hupu Forum Crawler

Python, as a high-level programming language, has somehow become popular in the community. Out of curiosity and to keep up with the times, I learned a bit of it myself. As "Lu Xun" supposedly said: learning without applying is just fooling around. So I wrote a crawler for the Hupu forum in Python. The scripts are a bit rough; they are meant as material for beginners to discuss, and as a reference for my own later use. I originally planned to write an analysis post about Hupu, but lost motivation and never finished it. Still, as a Spurs fan I am proud that our franchise ranks in the top three by forum heat.

Preparation: install Python, install MySQL, and optionally a virtual machine (used later to run the scripts on a server as a daily scheduled task).

1. Install Python: choose 3.x; installation steps omitted.

2. Install MySQL: choose version 5.6 or above; installation steps omitted.

3. Virtual machine: any Linux distribution; setup steps omitted.

Requirements

Crawl the Hupu forum boards and capture each post's content, author, and popularity (reads/replies).

Writing the scripts

The work is split into three parts: Part 1 parses the board listing pages and extracts each post's author and read/reply information; Part 2 fetches the body text of each post; Part 3 extracts data about the posters themselves, as input for later analysis. The scripts follow below. One thing to watch out for: encoding, encoding, encoding. Thank you!
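To make that warning concrete, here is a condensed checklist of the encoding settings the scripts below rely on, pulled together into one small runnable snippet (the URL is just the first page of the Spurs board, and the MySQL credentials are the same placeholder ones used throughout):

# -*- coding: utf-8 -*-            # 1. source-file encoding declaration
import requests
import pymysql
from bs4 import BeautifulSoup

resp = requests.get('https://bbs.hupu.com/spurs-1')        # example board page
soup = BeautifulSoup(resp.content, 'html.parser')          # 2. pass raw bytes; bs4 detects the page charset
with open('dump.txt', 'w', encoding='utf8') as f:          # 3. pin file I/O to utf8
    f.write(soup.get_text()[:200])
conn = pymysql.connect(host='localhost', user='root', passwd='1234',
                       db='spider', charset='utf8')        # 4. connection charset must match the tables
conn.close()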

Note: because of Hupu's anti-scraping measures, each sub-board only exposes 10 readable pages (my attempts to get around this failed, thank you!). My workaround is to put the script on a server and crawl once a day so the data accumulates over time.
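For the daily accumulation, a standard cron entry on the Linux server is enough, for example 0 3 * * * python3 /path/to/hupu_part1.py (the path is only illustrative). When re-crawling every day it also does not hurt to identify as a normal browser and pause between requests; a minimal sketch of such a fetch helper follows. The User-Agent string and delay are my own choices, not part of the original scripts, and Hupu may still cap each board at 10 pages:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}   # assumption: a browser-like UA string

def polite_get(url, delay=1.0):
    """Fetch one page, then pause briefly so daily runs stay gentle on the site."""
    resp = requests.get(url, headers=HEADERS, timeout=10)
    time.sleep(delay)
    return resp

# usage: drop-in replacement for requests.get(url) in the scripts below
# page = polite_get('https://bbs.hupu.com/spurs-1')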

Part 1: crawl each post's title, author, creation time, reads/replies, author link, etc., and load them into a local MySQL database

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import json
import time
import pymysql
import importlib, sys

importlib.reload(sys)

forum_note_sum = []  # accumulates one record per post
list_d = ['原创', '翻译', '讨论']  # if the first title link is one of these tags, take the second link instead
type = sys.getfilesystemencoding()  # kept from the original script; not used below


# num: index of a post within one page; extracts the post title, author and related fields
def parent_li_web(num):
    forum_note_record = {}
    try:
        parent_tiezi = bs_obj.find('ul', class_='for-list').find_all('li')[num]
        div_one = parent_tiezi.find('div', class_='titlelink box')
        div_two = parent_tiezi.find('div', class_='author box')
        span_three = parent_tiezi.find('span', class_='ansour box').string.strip()
        div_four = parent_tiezi.find('div', class_='endreply box')
        subname = div_one.a.string
        sublink = 'https://bbs.hupu.com' + div_one.a['href']
        team_tmp = theme_tmp
        for i in list_d:
            if i == subname:
                subname = div_one.find_all('a')[1].string
                sublink = 'https://bbs.hupu.com' + div_one.find_all('a')[1]['href']
                # print(i, subname, sublink)
        forum_note_record.update({
            'subname': subname,
            'subname_link': sublink,
            'author': div_two.a.string,
            'author_link': div_two.a['href'],
            'author_create_time': div_two.find('a', style='color:#808080;cursor: initial; ').string,
            'read_reply_number': span_three,
            'last_reply_writer': div_four.span.string,
            'last_reply_time': div_four.a.string,
            'team_tmp': team_tmp
        })
        forum_note_sum.append(forum_note_record)
    except:
        return None


if __name__ == '__main__':
    # all_spurs_note
    begin_time = time.time()
    print('---------Script started at: {}------------'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    team_list = ['rockets', 'warriors', 'cavaliers', 'spurs', 'lakers', 'celtics', 'thunder', 'clippers',
                 'timberwolves', 'mavericks', 'knicks', 'bulls', 'nets', 'sixers', 'jazz', 'pacers', 'blazers', 'heat',
                 'suns', 'grizzlies', 'wizards', 'pelicans', 'bucks', 'kings', 'raptors', 'nuggets', 'hawks', 'hornets',
                 'pistons', 'magic']
    for li in team_list:
        forum_note_sum_code = []
        theme_tmp = li
        for i in range(1, 11, 1):  # Hupu's anti-scraping caps each board at 10 readable pages; run daily on Linux to accumulate
            url = 'https://bbs.hupu.com/{}-{}'.format(li, i)
            print(url)
            wb_string = requests.get(url)
            bs_obj = BeautifulSoup(wb_string.content, 'html.parser')
            with open('web_spider_original.txt', 'w', encoding='utf8') as f:
                f.write(str(bs_obj))
                f.write('\r' * 10 + '-----separator-----' + '\r' * 10)
            for j in range(1, 61, 1):  # each page lists 60 posts
                parent_li_web(j)

    with open('hupu_spider_spurs_load.txt', 'w', encoding='utf8') as f:
        for item in forum_note_sum:
            json.dump(item, f, ensure_ascii=False)
            f.write('\r')

    # insert into mysql
    conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='spider', port=3306, charset='utf8')
    cur = conn.cursor()
    cur.execute('delete from hupuforum_spurs_note_daytmp')
    with open('hupu_spider_spurs_load.txt', 'r', encoding='utf8') as f:
        for item in f:
            item = json.loads(item)  # convert the JSON line back into a dict
            # print(type(item))
            cur.execute('insert into hupuforum_spurs_note_daytmp(subname,subname_link,author,author_link,author_create_time,read_reply_number,last_reply_writer,last_reply_time,theme_title) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)', (item['subname'], item['subname_link'], item['author'], item['author_link'], item['author_create_time'], item['read_reply_number'], item['last_reply_writer'], item['last_reply_time'], item['team_tmp']))
    conn.commit()
    cur.close()
    conn.close()
    print('Finished! This run took {} seconds'.format(time.time() - begin_time))
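The script assumes the staging table hupuforum_spurs_note_daytmp already exists. A minimal sketch of a matching schema, created once through pymysql (the column names mirror the INSERT above; the types and lengths are my guesses, not taken from the original post):

import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS hupuforum_spurs_note_daytmp (
    subname            VARCHAR(255),
    subname_link       VARCHAR(255),
    author             VARCHAR(100),
    author_link        VARCHAR(255),
    author_create_time VARCHAR(50),
    read_reply_number  VARCHAR(50),
    last_reply_writer  VARCHAR(100),
    last_reply_time    VARCHAR(50),
    theme_title        VARCHAR(50)
) DEFAULT CHARSET=utf8;
"""

conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='spider', port=3306, charset='utf8')
with conn.cursor() as cur:
    cur.execute(DDL)
conn.commit()
conn.close()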

Part 2: add each post's body text and refresh a few fields

# coding=utf8
import time
import requests
from bs4 import BeautifulSoup
import pymysql
import signal

begin_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234', db='spider', charset='utf8')
cur = conn.cursor()
sub_cur = conn.cursor()

# merge the daily temp table into the main table, then refresh the fields that change over time
cur.execute('INSERT INTO hupuforum_spurs_note SELECT * FROM hupuforum_spurs_note_daytmp WHERE subname_link NOT IN (SELECT a.subname_link FROM hupuforum_spurs_note a);')
cur.execute('update hupuforum_spurs_note a,hupuforum_spurs_note_daytmp b set a.read_reply_number=b.read_reply_number,a.last_reply_writer=b.last_reply_writer,a.last_reply_time=b.last_reply_time where a.subname_link=b.subname_link')
# conn.commit()
cur.execute('use spider;')
conn.commit()

cur.execute('select subname_link from hupuforum_spurs_note where sub_text is null;')
for url in cur.fetchall():
    url = list(url)
    # print(url)
    try:
        wb_page = requests.get(url[0], timeout=2)  # some pages hang indefinitely, so set a timeout
        bs_obj = BeautifulSoup(wb_page.content, 'html.parser')
        tmp_text = bs_obj.select('#tpc > div > div.floor_box > table.case > tbody > tr > td > div.quote-content')
        sub_text = tmp_text[0].get_text(strip=True)
        sub_text = sub_text.replace('\'', '’')
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format((sub_text[:1000]), str(url).replace('[', '').replace(']', ''))
        # print(sql)
        sub_cur.execute(sql)
        conn.commit()
        print('success')
    except IndexError as e:  # the post page no longer exists
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('网页不存在', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except pymysql.err.InternalError as e:  # the content contains emoji or other 4-byte utf8 characters
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('内容格式有误,导致出错!', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except requests.exceptions.ReadTimeout as e:  # the page did not respond in time
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('网页打开超时', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except Exception as e:  # any other error type
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('其他类型错误', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()

conn.commit()
cur.close()
sub_cur.close()
conn.close()
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
print('Finished, start time: {}, end time: {}'.format(begin_time, end_time))
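Two notes on the text handling above. First, the manual single-quote replacement and string formatting can be avoided by letting pymysql parameterize the query. Second, the pymysql.err.InternalError branch exists because the connection uses charset='utf8', which cannot store emoji; switching the connection and the sub_text column to utf8mb4 would store those posts instead of flagging them. A sketch of both changes follows; the ALTER TABLE is my suggestion (assuming the column types guessed earlier), and the two variables are placeholders standing in for the loop values:

import pymysql

sub_text = 'post body fetched earlier 🏀'        # placeholder for the text scraped above
link = 'https://bbs.hupu.com/12345678.html'      # placeholder for one subname_link value

conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234',
                       db='spider', charset='utf8mb4')       # utf8mb4 so 4-byte characters survive
with conn.cursor() as cur:
    # one-time column change (my suggestion, not part of the original scripts)
    cur.execute('ALTER TABLE hupuforum_spurs_note MODIFY sub_text TEXT CHARACTER SET utf8mb4')
    # parameterized update: pymysql handles the quoting, so the manual replace() is unnecessary
    cur.execute('UPDATE hupuforum_spurs_note SET sub_text=%s WHERE subname_link=%s',
                (sub_text[:1000], link))
conn.commit()
conn.close()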

Part 3: crawl registered users' profile information

# coding=utf8
import time
import requests
from bs4 import BeautifulSoup
import pymysql

begin_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234', db='spider', charset='utf8')
cur = conn.cursor()
sub_cur = conn.cursor()

cur.execute('select distinct author_link from hupuforum_spurs_note;')
for author_url in cur.fetchall():
    try:
        author_url = list(author_url)
        wb_obj = requests.get(author_url[0], timeout=2)
        bs_obj = BeautifulSoup(wb_obj.content, 'html.parser')
        author = bs_obj.select('#main > div.personal > div.personal_right > h3 > div')[0].string
        author_visited = bs_obj.select('#main > div.personal > div.personal_right > h3 > span')[0].string.replace('有', '').replace('人次访问', '')  # strip the "visited N times" wrapper text
        author_info = bs_obj.select('#main > div.personal > div.personal_right > div')[0].get_text(strip=True)
        sub_cur.execute('insert into hupuforum_authors_info(author,author_link,author_visited,author_info,author_status) values(%s,%s,%s,%s,%s)', (author, author_url[0], author_visited, author_info, '正常'))
    except IndexError as e:  # profile page unreachable or layout not found
        sub_cur.execute('insert into hupuforum_authors_info(author,author_link,author_visited,author_info,author_status) values(%s,%s,%s,%s,%s)',
                        ('', author_url[0], '', '', '无法访问'))
    conn.commit()

conn.commit()
cur.close()
conn.close()
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
print('Finished, start time: {}, end time: {}'.format(begin_time, end_time))
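With the three tables populated, the analysis the post originally aimed at is mostly a matter of SQL. One illustration (the query is my own example and only touches the notes table filled by Part 1):

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234',
                       db='spider', charset='utf8')
with conn.cursor() as cur:
    # ten most active posters across the boards crawled by Part 1
    cur.execute('SELECT author, COUNT(*) AS posts FROM hupuforum_spurs_note '
                'GROUP BY author ORDER BY posts DESC LIMIT 10')
    for author, posts in cur.fetchall():
        print(author, posts)
conn.close()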
