#_*_ coding=UTF-8 _*_
import requests
from bs4 import BeautifulSoup
import queue as Queue
import threading
# Browser-like User-Agent sent with the listing request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.90 Safari/537.36'
    ),
}
link = 'https://bbs.hupu.com/bxj'

# A timeout keeps the script from hanging forever on an unresponsive server.
# NOTE: the name ``re`` is kept because it is a module-level name other code
# may read; it would shadow the stdlib ``re`` module if that were imported.
re = requests.get(link, headers=headers, timeout=10)
soup = BeautifulSoup(re.content, 'lxml')
soup_list = soup.find('ul', class_='for-list')

# Unbounded queue: it is filled before any worker thread starts consuming, so
# a size cap would make ``put`` block forever once the page has more items
# than the cap.
work = Queue.Queue()

# ``find`` returns None when the list container is missing; treat that as an
# empty listing instead of failing on ``None.find_all``.
li_list = soup_list.find_all('li') if soup_list is not None else []
for lli in li_list:
    work.put(lli)
class myThread(threading.Thread):
    """Worker thread that calls ``crawler`` until the shared queue is empty.

    Args:
        name: Label used in this thread's console messages.
        q: Queue of items handed to ``crawler`` one call at a time.
    """

    def __init__(self, name, q):
        super().__init__()
        self.name = name
        self.q = q

    def run(self):
        """Announce start, process items until the queue drains, announce exit."""
        # The lock keeps the start banner from interleaving with other output.
        with gLock:
            print('starting ' + self.name)
        while not self.q.empty():
            try:
                crawler(self.name, self.q)
            except Exception as e:
                # Report the failure instead of swallowing it silently, then
                # stop this worker. gLock is deliberately NOT taken here: the
                # state of the lock is unknown after crawler raised, and
                # re-acquiring a held non-reentrant Lock would hang.
                print(self.name, 'ERROR:', e)
                break
        print('exit ' + self.name)
gLock = threading.Lock()  # serializes console output across worker threads


def _format_post(lli):
    """Return the report lines for one post ``<li>`` element.

    Raises AttributeError, IndexError, KeyError or TypeError when the element
    lacks any of the expected tags, attributes or text.
    """
    title = lli.find('div', class_='titlelink box')
    author = lli.find('div', class_='author box')
    # The counter text has the form "23/34"; split it into its two numbers.
    reply_view = lli.find('span', class_='ansour box').text.strip().split('/')
    endreply = lli.find('div', class_='endreply box')
    return [
        '标 题:' + title.a.string,
        '标题链接:' + "https://bbs.hupu.com" + title.a['href'],
        '作 者:' + author.a.string,
        '发布时间:' + author.contents[5].string,
        # strip() removes the spaces left around each number by the split.
        '回 复:' + reply_view[0].strip(),
        '浏 览:' + reply_view[1].strip(),
        '最后回复时间:' + endreply.a.string,
        '最后回复链接:' + "https://bbs.hupu.com/" + endreply.a['href'],
        '最后回复人 :' + endreply.span.string,
        '',  # blank separator line between posts
    ]


def crawler(threadName, q):
    """Take one post element from ``q`` and print its details.

    Args:
        threadName: Label of the calling worker, used in error messages.
        q: Queue holding the post elements still to be processed.

    Returns immediately, without printing, when the queue is already empty.
    A post that cannot be parsed produces a single
    ``<threadName> ERROR: <reason>`` line instead of raising.
    """
    # Non-blocking get: another worker may have taken the last item after the
    # caller checked ``q.empty()``; a blocking get would then wait forever.
    try:
        lli = q.get_nowait()
    except Queue.Empty:
        return

    # Parse outside the lock so only the printing is serialized.
    try:
        lines = _format_post(lli)
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        lines = ['%s ERROR: %s' % (threadName, e)]

    # ``with`` guarantees the lock is released even if printing fails, and
    # printing a whole post at once keeps records from interleaving.
    with gLock:
        for line in lines:
            print(line)
# Labels for the worker threads; one thread is started per label.
threadlist = [
    'thread1',
    'thread2',
    'thread3',
    'thread4',
    'thread5',
]

# Every started worker is recorded here, in start order.
threads = []
for tname in threadlist:
    # All workers share the same ``work`` queue.
    thread = myThread(tname, work)
    thread.start()
    threads.append(thread)