# --- getCommentInfo.py ---
from bs4 import BeautifulSoup
import requests
from mylog import MyLog as mylog
# 《Python 网络爬虫实战》胡松涛著 P196
class Item:
    """Plain data holder for one Tieba thread entry.

    All fields default to None and are filled in by GetTiebaInfo.spider.
    """

    def __init__(self):
        # Per-instance defaults (the original used shared class attributes,
        # which is un-idiomatic for mutable record-style objects).
        self.title = None        # thread title
        self.firstAuthor = None  # thread creator's display name
        self.firstTime = None    # thread creation time
        self.reNum = None        # reply count
        self.content = None      # one-line abstract of the thread body
        self.lastAuthor = None   # author of the latest reply
        self.lastTime = None     # time of the latest reply
class GetTiebaInfo:
    """Scrape thread summaries from a Baidu Tieba board and dump them to a text file.

    Workflow (all driven from __init__): build page URLs, fetch and parse each
    page into Item records, then write the records to a UTF-8 text file.
    """

    def __init__(self, url):
        # Seed URL; must end with a "pn=<offset>" query field (see getUrls).
        self.url = url
        self.log = mylog()
        self.pageSum = 1  # number of listing pages to crawl (50 threads each)
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    # Example page URLs:
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100
    def getUrls(self, pageSum):
        """Build one URL per page by rewriting the trailing pn= offset (0, 50, ...).

        Assumes the pn offset is the last '='-separated field of self.url.
        Returns the list of page URLs.
        """
        urls = []
        pns = [str(i * 50) for i in range(pageSum)]
        ul = self.url.split("=")
        for pn in pns:
            ul[-1] = pn
            url = "=".join(ul)
            print(url)
            urls.append(url)
        return urls

    def spider(self, urls):
        """Fetch every page URL and extract one Item per thread; return the list."""
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # Download failed (see getResponseContent); skip this page
                # instead of crashing BeautifulSoup on a None input.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            # NOTE: the leading space in the class value is significant on Tieba pages.
            tagsli = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
            for tag in tagsli:
                item = Item()
                item.title = tag.find('a', attrs={'class': 'j_th_tit'}).get_text().strip()
                # The author name sits in a nested <a>, hence the extra .a step.
                item.firstAuthor = tag.find('span', attrs={'class': 'frs-author-name-wrap'}).a.get_text().strip()
                item.firstTime = tag.find('span', attrs={'title': '创建时间'}).get_text().strip()
                item.reNum = tag.find('span', attrs={'title': '回复'}).get_text().strip()
                # NOTE: the trailing space in the class value is significant.
                item.content = tag.find('div',
                                        attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
                item.lastAuthor = tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}).get_text().strip()
                item.lastTime = tag.find('span', attrs={'title': '最后回复时间'}).get_text().strip()
                items.append(item)
                self.log.info('获取标题为《%s》的项成功 ...' % item.title)
        return items

    def pipelines(self, items):
        """Write all scraped items to a UTF-8 text file, one multi-line record each."""
        fileName = '百度贴吧_python.txt'
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write(
                    'title:{} \t author:{} \t firstTime:{} \ncontent:{} \n return:{} \n lastAuthor:{} \t lastTime:{} \n\n\n\n'
                    .format(item.title, item.firstAuthor, item.firstTime, item.content, item.reNum, item.lastAuthor,
                            item.lastTime))

    def getResponseContent(self, url):
        """GET *url* and return the response body as text, or None on failure.

        The original used a bare ``except:`` (which also swallows
        KeyboardInterrupt/SystemExit) and fell through to an implicit None;
        now only network-level requests errors are caught and the None
        return is explicit so spider() can skip the page.
        """
        try:
            response = requests.get(url)
        except requests.RequestException:
            self.log.error('Python 返回 URL:%s 数据失败' % url)
            return None
        self.log.info('Python 返回 URL:%s 数据成功' % url)
        return response.text
if __name__ == '__main__':
    # Crawl the Python board; the trailing pn value is rewritten per page
    # by GetTiebaInfo.getUrls, and __init__ runs the whole pipeline.
    start_url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50'
    GTI = GetTiebaInfo(start_url)
# --- mylog.py ---
import logging
import getpass
import sys
class MyLog:
    """Thin wrapper around ``logging`` that writes to both a file and the console.

    The log file is named after the running script: ``sys.argv[0]`` with its
    last three characters (assumed to be '.py') replaced by '.log'.
    """

    def __init__(self):
        self.user = getpass.getuser()
        # getLogger returns the SAME logger object for the same name, so
        # handlers must only be attached once (see guard below).
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # NOTE(review): assumes sys.argv[0] ends in '.py' — fragile for
        # frozen executables or interactive sessions.
        self.logFile = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        # Fix: the original re-added both handlers on every MyLog()
        # instantiation, duplicating each message and re-opening the file.
        if not self.logger.handlers:
            self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
            self.logHand.setFormatter(self.formatter)
            self.logHand.setLevel(logging.DEBUG)
            self.logHandst = logging.StreamHandler()
            self.logHandst.setFormatter(self.formatter)
            self.logHandst.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHand)
            self.logger.addHandler(self.logHandst)

    # One convenience method per standard logging level.
    def debug(self, msg):
        self.logger.debug(msg)

    def info(self, msg):
        self.logger.info(msg)

    def warn(self, msg):
        # Method name kept for existing callers; Logger.warn is deprecated,
        # so delegate to the supported Logger.warning.
        self.logger.warning(msg)

    def error(self, msg):
        self.logger.error(msg)

    def critical(self, msg):
        self.logger.critical(msg)
if __name__ == '__main__':
    # Smoke-test every log level, including non-ASCII payloads.
    mylog = MyLog()
    for emit, message in (
        (mylog.debug, u"I'm debug 测试中文"),
        (mylog.info, "I'm info"),
        (mylog.warn, "I'm warm"),
        (mylog.error, u"I'm error 测试中文"),
        (mylog.critical, "I'm critical"),
    ):
        emit(message)