1. Main program:
import time

import spider_thread


class SpiderMain():
    def craw(self):
        # Open the output file once and share the handle with both worker threads
        with open('baike.txt', 'a', encoding='utf-8') as f:
            thread1 = spider_thread.MyThread(1, f)
            thread2 = spider_thread.MyThread(2, f)
            thread1.start()
            thread2.start()
            # Wait for both workers to finish before the file is closed
            thread1.join()
            thread2.join()


if __name__ == '__main__':
    tim = time.time()
    SpiderMain().craw()
    print("Elapsed: %f" % (time.time() - tim))
2. Thread:
import threading

import html_parser
import html_spider


class MyThread(threading.Thread):
    head = 1                 # next page id to crawl (shared by all threads)
    tail = 5                 # stop before this page id
    lock = threading.Lock()  # guards the shared head counter
    baike = 'http://baike.baidu.com/view'
    parser = html_parser.HtmlParser()
    spider = html_spider.HtmlSpider()

    def __init__(self, id, out):
        threading.Thread.__init__(self)
        self.id = id
        self.out = out

    def run(self):
        while True:
            # Atomically claim the next page id
            MyThread.lock.acquire()
            page = MyThread.head
            MyThread.head += 1
            MyThread.lock.release()
            if page < MyThread.tail:
                url = MyThread.baike + '/' + str(page) + '.html'
                try:
                    html = MyThread.spider.download(url)
                    data = MyThread.parser.parse(url, html)
                except Exception:
                    data = None
                if not data:
                    # Download or parse failed: record the url with empty fields
                    data = {'url': url, 'title': None, 'summary': None}
                print('Thread-%d: %s\t%s\t%s' % (self.id, data['url'], data['title'], data['summary']))
                self.out.write('%s\t%s\t%s\n' % (data['url'], data['title'], data['summary']))
            else:
                break
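Note that the lock above only guards the shared page counter; writes to the shared file handle are not synchronized, so output lines from the two threads can in principle interleave. An alternative sketch, not the original code, that hands out page ids through a queue.Queue and serializes writes (worker and write_lock are hypothetical names; the html_spider/html_parser modules are assumed unchanged):

import queue
import threading

import html_parser
import html_spider

def worker(q, out, write_lock):
    spider = html_spider.HtmlSpider()
    parser = html_parser.HtmlParser()
    while True:
        try:
            page = q.get_nowait()  # claim the next page id; no manual locking needed
        except queue.Empty:
            return
        url = 'http://baike.baidu.com/view/%d.html' % page
        try:
            data = parser.parse(url, spider.download(url))
        except Exception:
            data = None
        if not data:
            data = {'url': url, 'title': None, 'summary': None}
        with write_lock:  # keep each output line intact
            out.write('%s\t%s\t%s\n' % (data['url'], data['title'], data['summary']))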
3. Crawling:
import urllib.request


class HtmlSpider():
    def download(self, url):
        """Download a web page."""
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
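As written, download uses urllib's default client with no timeout, so a slow server can stall a worker indefinitely, and some sites reject the default User-Agent. A minimal hardening sketch (the header value and the 10-second timeout are assumptions, not from the original):

import urllib.request

def download_hardened(url, timeout=10):
    # Send a browser-like User-Agent and bound the wait with a timeout
    if url is None:
        return None
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        if response.getcode() != 200:
            return None
        return response.read()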
4. Parsing:
from bs4 import BeautifulSoup


class HtmlParser():
    def _get_new_data(self, page_url, soup):
        """Extract the original url, entry title, and summary."""
        res_data = {}
        # Original url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>百度百科</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text().strip()
        # <div class="lemma-summary" label-module="lemmaSummary"></div>
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = ''.join(summary_node.get_text().split())
        return res_data

    def parse(self, page_url, html_cont):
        """Parse an html page."""
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser')
        # print(soup.prettify())
        result = self._get_new_data(page_url, soup)
        return result
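To sanity-check the selectors without hitting the network, the parser can be fed an inline snippet shaped like the markup in the comments above (sample is hypothetical test data):

sample = (
    '<dd class="lemmaWgt-lemmaTitle-title"><h1>百度百科</h1></dd>'
    '<div class="lemma-summary" label-module="lemmaSummary"> A free online encyclopedia. </div>'
)
result = HtmlParser().parse('http://baike.baidu.com/view/1.html', sample)
print(result['title'])    # 百度百科
print(result['summary'])  # Afreeonlineencyclopedia. (whitespace removed by the split/join)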
Verdict: simple and blunt yet not without elegance, flashy but dead easy to pick up. Let's go!