After studying for a while, I wanted to find some sites to practice on, so over the last few days I tried scraping Baidu Tieba. (The code is still not very polished and may have problems here and there; if you spot any, please point them out, I'd be very grateful.)
Goal: scrape each reply's author ID, author nickname, reply content, and reply time from the target thread (URL: http://tieba.baidu.com/p/3522395718?pn=1).
import csv
import json
import os
import time
import urllib.request
from bs4 import BeautifulSoup

def write_Data(contents):
    # Append one page of rows to the CSV; write the header only when the
    # file is first created, otherwise every page would repeat it
    write_header = not os.path.exists('bdtb.csv')
    csvFile = open('bdtb.csv', 'a', newline='', encoding='utf-8')
    writer = csv.writer(csvFile)
    if write_header:
        writer.writerow(('author_Id', 'author_Name', 'content', 'reply_time'))
    for each in contents:
        writer.writerow(each)
    csvFile.close()
def getPageInfo(pageNum):
    url = 'http://tieba.baidu.com/p/3522395718?pn=' + str(pageNum)
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    bs = BeautifulSoup(html, 'lxml')
    content_field = bs.select('div.l_post')  # one div per reply (floor)
    item = []
    for div in content_field:
        data_field = json.loads(div.get('data-field'))  # parse the JSON in the data-field attribute
        user_Id = data_field['author']['user_id']       # the user's ID
        user_Name = data_field['author']['user_name']   # the user's name
        content = div.select('div.d_post_content')[0]   # the reply body
        date = data_field['content']['date']            # the reply time
        item.append([user_Id, user_Name.strip(), content.text.strip(), date.strip()])  # strip surrounding whitespace
    return item
def deleteOldTxt():
    filename = 'bdtb.csv'
    if os.path.exists(filename):
        os.remove(filename)
        print('\nFound an old list, deleted it\nStarting collection\n')

deleteOldTxt()  # remove any existing list
for i in range(1, 36):
    print('Writing page ' + str(i) + ' of data')
    contents = getPageInfo(i)
    print(contents)
    write_Data(contents)
    time.sleep(3)
This runs and produces the expected output.
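What makes this work is the data-field attribute Tieba embeds in each reply's div: json.loads turns it into a dict holding the author and content metadata. A minimal sketch of that extraction step in isolation, with a hypothetical payload containing only the keys the script actually reads (the real attribute carries more fields and its exact schema may vary):

import json

# Hypothetical data-field payload; only the keys the script reads are shown
sample = '{"author": {"user_id": 123456, "user_name": "someone"}, "content": {"date": "2015-01-11 16:52"}}'

data_field = json.loads(sample)
print(data_field['author']['user_id'])    # 123456
print(data_field['author']['user_name'])  # someone
print(data_field['content']['date'])      # 2015-01-11 16:52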
Later, when I tried scraping other threads, I found that some users have a user_id while others don't, so I made some changes to the code (the full revision follows the short sketch below):
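The missing-ID problem comes down to safe dictionary access. Here are two equivalent idioms in isolation (a minimal sketch with a made-up author dict; note that Python 3 dropped dict.has_key(), so membership is tested with the in operator, and dict.get() with a default is the more compact alternative):

# A hypothetical author dict for a reply whose author has no user_id
author = {'user_name': 'someone'}

# Idiom 1: explicit membership test (what the revised script below uses)
ID = author['user_id'] if 'user_id' in author else ''

# Idiom 2: get() with a default value
ID = author.get('user_id', '')

print(repr(ID))  # ''

The full revised spider follows.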
# -*- coding: utf-8 -*-
import csv
import json
import os
import time
import urllib.request
import urllib.error
from bs4 import BeautifulSoup

# Baidu Tieba spider class
class BDTB:
    def __init__(self, baseUrl):
        self.pageNum = 1
        self.baseUrl = baseUrl  # thread URL is passed in rather than read from a global
    # Fetch the page source for a given page index
    def getPage(self, pageNum):
        try:
            url = self.baseUrl + '?pn=' + str(pageNum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
            return html
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('Failed to connect to Baidu Tieba, reason:', e.reason)
            return None
    # Parse one page of replies with BeautifulSoup
    def getPageInfo(self, html):
        bs = BeautifulSoup(html, 'lxml')
        content_field = bs.select('div.l_post')  # the divs we want to scrape
        item = []
        for div in content_field:
            data_field = json.loads(div.get('data-field'))  # parse the JSON in data-field
            # Some users have a user_id and some do not, so test membership first.
            # (Python 3 dropped dict.has_key(); use the `in` operator instead.)
            author = data_field['author']
            ID = author['user_id'] if 'user_id' in author else ''
            user_Name = author['user_name']
            content = div.select('div.d_post_content')[0]
            date = data_field['content']['date']
            item.append([ID, user_Name.strip(), content.text.strip(), date.strip()])  # add the row to the list
        return item
    def write_Data(self, contents):
        # Append rows to the CSV; write the header only when the file is first created
        write_header = not os.path.exists('mayday.csv')
        csvFile = open('mayday.csv', 'a', newline='', encoding='utf-8')
        writer = csv.writer(csvFile)
        if write_header:
            writer.writerow(('user_Id', 'author_Name', 'content', 'reply_time'))
        for item in contents:
            writer.writerow(item)
        csvFile.close()

    def deleteOldTxt(self):
        filename = 'mayday.csv'
        if os.path.exists(filename):
            os.remove(filename)
            print('Found an old list, deleted it')
    def start(self):
        print('Reading thread content:')
        try:
            for i in range(1, 37):
                print('Writing page ' + str(i) + ' of content')
                html = self.getPage(i)
                if html is None:  # fetching failed; parsing None would raise
                    continue
                contents = self.getPageInfo(html)
                print(contents)
                self.write_Data(contents)
                time.sleep(2)
        except IOError as e:
            print('Write error, reason: ' + str(e))  # Python 3 exceptions have no .message
        finally:
            print('Write task finished')
baseUrl = 'http://tieba.baidu.com/p/708098062'
spider = BDTB(baseUrl)
spider.deleteOldTxt()
spider.start()
This program also scrapes the data successfully.
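To spot-check the result, the CSV can be read back with the standard library. A minimal sketch, assuming mayday.csv was produced by the run above:

import csv

with open('mayday.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)          # user_Id, author_Name, content, reply_time
    for row in list(reader)[:5]:   # show the first five replies
        print(dict(zip(header, row)))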