# Python — CSDN blog crawler

 

# encoding:utf-8
__author__ = 'Sun'

import re
import urllib.request
import urllib
import queue
import threading
import os


# Shared work queue of article URLs (note: this rebinds the name of the
# imported ``queue`` module — keep that in mind when reading the code below).
queue = queue.Queue()
# URLs already enqueued, used to avoid crawling the same article twice.
visited = set()
# Number of pages fetched so far; updated by the worker threads.
cnt = 0
class CsdnBlogSpider(threading.Thread):
    """Worker thread that pulls CSDN article URLs from a shared queue,
    downloads each page, saves its text under ./blog/, and enqueues any
    article links found on the page.
    """

    # One lock shared by every worker so updates to the global counter are
    # actually serialized (a per-instance lock would protect nothing).
    _lock = threading.Lock()

    def __init__(self, queue, opener, blog_name):
        """Create a worker.

        queue     -- shared queue.Queue of URLs to fetch
        opener    -- urllib opener with browser-like headers installed
        blog_name -- lowercase CSDN blog id used to match article links
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.opener = opener
        self.blog_name = blog_name
        self.lock = CsdnBlogSpider._lock  # shared, not per-thread

    def htmltotxt(self, data, fout):
        """Extract the article body from an HTML page and write plain text to *fout*.

        *data* may be raw response bytes or a file-like response object;
        *fout* must be opened in binary mode.
        """
        if isinstance(data, bytes):
            html = data.decode("utf-8")
        else:
            html = data.read().decode("utf-8")
        # Pull out the styled content <div>s, then strip remaining HTML tags.
        contents = re.findall(r'<div style=".*?">(.*?)</div>', html, re.S)
        tag_re = re.compile(r'<[^>]*>')
        for part in contents:
            # The original wrote ``fout`` to itself; write the cleaned text.
            fout.write(tag_re.sub('', part).encode('utf-8'))

    def save_data(self, data, filename):
        """Save the extracted article text as ./blog/<filename>.txt.

        The original ignored *filename* and overwrote a single hardcoded
        file on every call; it also never closed the file handle.
        """
        if not os.path.exists('blog'):
            blog_path = os.path.join(os.path.abspath('.'), 'blog')
            os.mkdir(blog_path)
        # Strip characters that are illegal in file names on common OSes.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', filename) or 'untitled'
        try:
            with open(os.path.join('blog', safe_name + '.txt'), 'wb') as fout:
                self.htmltotxt(data, fout)  # was a NameError (missing self.)
        except IOError as e:
            print(e)

    def find_title(self, data):
        """Return the page title parsed from raw response bytes.

        Skips past '<title>' and reads up to the first CRLF — relies on
        CSDN's page layout placing the title on its own line.
        """
        text = data.decode('utf-8')
        begin = text.find(r'<title') + 7   # len('<title>') == 7
        end = text.find('\r\n', begin)
        return text[begin:end]

    def run(self):
        """Main worker loop: fetch, save, discover new links, repeat."""
        global cnt
        global visited
        while True:
            url = self.queue.get()
            with self.lock:
                cnt += 1
                print('已经抓取:' + str(cnt - 1) + '正在抓取---->' + url)
            try:
                res = self.opener.open(url, timeout=1000)
            except Exception as e:
                if hasattr(e, 'reason'):
                    print('reason:', e.reason)
                elif hasattr(e, 'code'):
                    print('error code:', e.code)
                with self.lock:  # original decremented without the lock
                    cnt -= 1
                self.queue.task_done()
                continue
            else:
                data = res.read()
            title = self.find_title(data)
            self.save_data(data, title)

            # Discover further article links on this page and enqueue new ones.
            text = data.decode('utf-8')
            pattern = re.compile('/' + self.blog_name + r'/article/details/\d*')
            for link in pattern.findall(text):
                link = 'http://blog.csdn.net' + link
                if link not in visited:
                    self.queue.put(link)
                    visited.add(link)
            # Mark this URL processed so queue.join() in init() can unblock.
            self.queue.task_done()

def init(name, number=10):
    """Crawl all article pages of CSDN blog *name* with *number* worker threads.

    Seeds the shared queue with the blog's front page, starts daemon workers,
    blocks until the queue drains, then prints the number of pages fetched.
    """
    global cnt
    global visited
    blog_name = name.lower()  # CSDN blog ids are lowercase in URLs
    th_num = int(number)
    url = 'http://blog.csdn.net/' + blog_name + '/'

    # Pretend to be a browser so CSDN does not reject the requests.
    # Headers must be attached before the opener is used (the original set
    # them after install_opener, which worked only because it is the same
    # object — made explicit here).
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko')
    ]
    urllib.request.install_opener(opener)

    queue.put(url)
    visited.add(url)
    cnt = 0

    for _ in range(th_num):
        worker = CsdnBlogSpider(queue, opener, blog_name)
        worker.daemon = True  # setDaemon() is deprecated since Python 3.10
        worker.start()
    queue.join()  # wait until every enqueued URL has been task_done()'d
    print('--------end!!!-----')
    print('共抓取:' + str(cnt))

# Script entry point: crawl the blog of CSDN user "xiaoxiaoCYG"
# with the default number of worker threads.
if __name__ == '__main__':
	init("xiaoxiaoCYG")

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
引用[1]和引用[2]提供了关于 Python 控制台的一些信息。python-console-snake 是一个可以在控制台中运行的轻量级贪吃蛇游戏;通过 python-browser-console,你可以轻松获得实时运行的应用程序的交互式 Python 控制台。你可以使用 git clone 命令克隆存储库,然后按照相应的步骤安装和运行这些应用程序。至于你提到的“python --console”,由于没有给出具体的上下文,无法给出确切的回答;通常可以理解为在命令行中启动 Python 解释器,以便在控制台中交互式地输入语句并查看结果。如果你有更具体的问题,可以进一步描述,这样就能得到更准确的回答。

参考资料:
1. python-console-snake(在控制台中运行的轻量级贪吃蛇游戏):https://download.csdn.net/download/weixin_38744207/11771319
2. python-browser-console(轻松获得实时运行的应用程序的交互式 Python 控制台):https://download.csdn.net/download/weixin_42157188/19853071
3. Python 爬虫实战笔记——股票爬取示例:https://download.csdn.net/download/weixin_52057528/88258593
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值