# -*- coding: utf-8 -*-
# Coding declaration added: this Python 2 source contains non-ASCII (Chinese)
# string literals further down and would not even parse without it.
from bs4 import BeautifulSoup
import logging
import sys
import urllib2  # bug fix: urllib2 is used by getResponseContent() but was never imported

# Python 2 workaround so implicit str<->unicode conversions default to UTF-8
# (the scraped pages and log messages contain Chinese text).
reload(sys)
sys.setdefaultencoding("utf-8")


class Item(object):
    """Plain data holder for one forum thread scraped from a listing page."""
    title = None        # thread title
    firstAuthor = None  # original poster
    firstTime = None    # thread creation time
    reNum = None        # reply / view count
    LastTime = None     # time of the last reply
    LastAuthor = None   # author of the last reply
    link = None         # relative link to the thread page
# Module-level helper: fetch a URL and return the raw response body.
def getResponseContent(url):
    """Download *url* with a 20s timeout and return the raw HTML.

    Returns None on any download failure; errors are logged instead of
    raised so one bad page does not abort the whole crawl.
    """
    try:
        response = urllib2.urlopen(url.encode('utf8'), timeout=20)
    except Exception:  # URLError, socket.timeout, ... (was a bare except)
        logging.error(u'Python返回URL:{}数据失败'.format(url))
        return None  # explicit: callers must handle a failed fetch
    else:
        logging.info(u'Python返回URL:{}数据成功'.format(url))
        return response.read()


class getHupuInfo(object):
    """Scrape thread listings from the Hupu BXJ forum and write titles to disk."""

    def __init__(self, url):
        self.url = url
        self.pageSum = 3  # number of listing pages to crawl
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return listing-page URLs: the base url plus '<url>-2', '<url>-3', ..."""
        urls = [self.url]
        for pn in range(1, pageSum):
            urls.append(self.url + '-' + str(pn + 1))
        logging.info(u'获取URLS成功!\n')
        return urls

    def spider(self, urls):
        """Parse every listing page and return a list of Item objects."""
        items = []
        for url in urls:
            htmlContent = getResponseContent(url)
            if htmlContent is None:
                # bug fix: a failed download used to pass None into BeautifulSoup
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagtable = soup.find('table', attrs={'id': 'pl'})
            if tagtable is None:
                continue  # page layout changed or empty page; skip instead of crashing
            tagstr = tagtable.find_all('tr')
            # tagstr[1:] skips the table header row (replaces the manual flag counter)
            for tag in tagstr[1:]:
                item = Item()
                item.link = '/' + tag.get('mid') + '.html'
                item.title = tag.find('td', attrs={'class': 'p_title'}).find('a', href=item.link).get_text()
                item.firstAuthor = tag.find('td', attrs={'class': 'p_author'}).a.get_text()
                item.firstTime = tag.find('td', attrs={'class': 'p_author'}).get_text()
                item.reNum = tag.find('td', attrs={'class': 'p_re'}).get_text()
                item.LastAuthor = tag.find('td', attrs={'class': 'p_retime'}).a.get_text()
                item.LastTime = tag.find('td', attrs={'class': 'p_retime'}).get_text()
                items.append(item)
        logging.info(u'获取帖子成功')
        return items

    def pipelines(self, items):
        """Write each thread title to a UTF-8 text file, one title per line."""
        fileName = u'Hupu_bxj.txt'
        with open(fileName, 'w') as fp:
            for item in items:
                fp.write('{}\n'.format(item.title).encode('utf8'))
        logging.info(u'写入文本成功')

    def getpiclink(self):
        """Return absolute thread links built from the site prefix."""
        piclink = []
        for item in self.items:
            # self.url[0:20] is 'https://bbs.hupu.com' for the expected base url
            # NOTE(review): breaks silently if a different base url is passed — confirm
            piclink.append(self.url[0:20] + item.link)
        logging.info(u'返回图片帖子链接成功')
        return piclink
logging.info(u'返回图片帖子链接成功')returnpiclinkclasspicInfo(object):def __init__(self,links):
self.links=links
self.imgurls=[]
self.spider()
self.pipeline()defspider(self):if self.links ==None:
logging.error('无图片链接')else:for link inself.links:
htmlContent=getResponseContent(link)
soup= BeautifulSoup(htmlContent,'lxml')
tagDiv= soup.find('div',attrs={'id':'tpc'})
img= tagDiv.find('div',attrs={'class':'quote-content'}).find_all('img')if img ==None:continue
else:for subimg inimg:
if subimg.get('data-original') ==None:
imgurl= subimg.get('src')else:
imgurl= subimg.get('data-original')
self.imgurls.append(imgurl)
logging.info(u'获取图片链接成功')defpipeline(self):for i inrange(len(self.imgurls)):
if self.imgurls[i][-3:] == 'png':
imgname= str(i) + '.png'
elif self.imgurls[i][-3:] == 'jpg':
imgname= str(i) + '.jpg'
elif self.imgurls[i][-4:] == 'jpeg':
imgname= str(i) + '.jpeg'
elif self.imgurls[i][-3:] == 'gif':
imgname= str(i) + '.jpeg'
else:continueimg=getResponseContent(self.imgurls[i])
with open (imgname,'ab') as fp:
fp.write(img)
logging.info(u'写入图片成功')if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
url= u'https://bbs.hupu.com/bxj'HUPU=getHupuInfo(url)
picurls=HUPU.getpiclink()
PIC= picInfo(picurls)