# Python — CSDN blog crawler

 

# encoding:utf-8
__author__ = 'Sun'

import re
import urllib.request
import urllib
import queue
import threading
import os


# Shared work queue of article URLs (note: this rebinds the name of the
# imported ``queue`` module — keep that in mind when reading the code below).
queue = queue.Queue()
# URLs already enqueued, used to avoid crawling the same article twice.
visited = set()
# Number of pages fetched so far; updated by the worker threads.
cnt = 0
class CsdnBlogSpider(threading.Thread):
    """Worker thread that pulls CSDN article URLs from a shared queue,
    downloads each page, saves its text under ./blog/, and enqueues any
    article links found on the page.
    """

    # One lock shared by every worker so updates to the global counter are
    # actually serialized (a per-instance lock would protect nothing).
    _lock = threading.Lock()

    def __init__(self, queue, opener, blog_name):
        """Create a worker.

        queue     -- shared queue.Queue of URLs to fetch
        opener    -- urllib opener with browser-like headers installed
        blog_name -- lowercase CSDN blog id used to match article links
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.opener = opener
        self.blog_name = blog_name
        self.lock = CsdnBlogSpider._lock  # shared, not per-thread

    def htmltotxt(self, data, fout):
        """Extract the article body from an HTML page and write plain text to *fout*.

        *data* may be raw response bytes or a file-like response object;
        *fout* must be opened in binary mode.
        """
        if isinstance(data, bytes):
            html = data.decode("utf-8")
        else:
            html = data.read().decode("utf-8")
        # Pull out the styled content <div>s, then strip remaining HTML tags.
        contents = re.findall(r'<div style=".*?">(.*?)</div>', html, re.S)
        tag_re = re.compile(r'<[^>]*>')
        for part in contents:
            # The original wrote ``fout`` to itself; write the cleaned text.
            fout.write(tag_re.sub('', part).encode('utf-8'))

    def save_data(self, data, filename):
        """Save the extracted article text as ./blog/<filename>.txt.

        The original ignored *filename* and overwrote a single hardcoded
        file on every call; it also never closed the file handle.
        """
        if not os.path.exists('blog'):
            blog_path = os.path.join(os.path.abspath('.'), 'blog')
            os.mkdir(blog_path)
        # Strip characters that are illegal in file names on common OSes.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', filename) or 'untitled'
        try:
            with open(os.path.join('blog', safe_name + '.txt'), 'wb') as fout:
                self.htmltotxt(data, fout)  # was a NameError (missing self.)
        except IOError as e:
            print(e)

    def find_title(self, data):
        """Return the page title parsed from raw response bytes.

        Skips past '<title>' and reads up to the first CRLF — relies on
        CSDN's page layout placing the title on its own line.
        """
        text = data.decode('utf-8')
        begin = text.find(r'<title') + 7   # len('<title>') == 7
        end = text.find('\r\n', begin)
        return text[begin:end]

    def run(self):
        """Main worker loop: fetch, save, discover new links, repeat."""
        global cnt
        global visited
        while True:
            url = self.queue.get()
            with self.lock:
                cnt += 1
                print('已经抓取:' + str(cnt - 1) + '正在抓取---->' + url)
            try:
                res = self.opener.open(url, timeout=1000)
            except Exception as e:
                if hasattr(e, 'reason'):
                    print('reason:', e.reason)
                elif hasattr(e, 'code'):
                    print('error code:', e.code)
                with self.lock:  # original decremented without the lock
                    cnt -= 1
                self.queue.task_done()
                continue
            else:
                data = res.read()
            title = self.find_title(data)
            self.save_data(data, title)

            # Discover further article links on this page and enqueue new ones.
            text = data.decode('utf-8')
            pattern = re.compile('/' + self.blog_name + r'/article/details/\d*')
            for link in pattern.findall(text):
                link = 'http://blog.csdn.net' + link
                if link not in visited:
                    self.queue.put(link)
                    visited.add(link)
            # Mark this URL processed so queue.join() in init() can unblock.
            self.queue.task_done()

def init(name, number=10):
    """Crawl all article pages of CSDN blog *name* with *number* worker threads.

    Seeds the shared queue with the blog's front page, starts daemon workers,
    blocks until the queue drains, then prints the number of pages fetched.
    """
    global cnt
    global visited
    blog_name = name.lower()  # CSDN blog ids are lowercase in URLs
    th_num = int(number)
    url = 'http://blog.csdn.net/' + blog_name + '/'

    # Pretend to be a browser so CSDN does not reject the requests.
    # Headers must be attached before the opener is used (the original set
    # them after install_opener, which worked only because it is the same
    # object — made explicit here).
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko')
    ]
    urllib.request.install_opener(opener)

    queue.put(url)
    visited.add(url)
    cnt = 0

    for _ in range(th_num):
        worker = CsdnBlogSpider(queue, opener, blog_name)
        worker.daemon = True  # setDaemon() is deprecated since Python 3.10
        worker.start()
    queue.join()  # wait until every enqueued URL has been task_done()'d
    print('--------end!!!-----')
    print('共抓取:' + str(cnt))

# Script entry point: crawl the blog of CSDN user "xiaoxiaoCYG"
# with the default number of worker threads.
if __name__ == '__main__':
	init("xiaoxiaoCYG")

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
引用[1]和引用[2]提供了关于 Python 控制台的一些信息。python-console-snake 是一个可以在控制台中运行的轻量级贪吃蛇游戏;通过 python-browser-console,你可以轻松获得实时运行的应用程序的交互式 Python 控制台。你可以使用 git clone 命令克隆存储库,然后按照相应的步骤安装和运行这些应用程序。至于你提到的“python --console”,由于没有给出具体的上下文,无法给出确切的回答;通常可以理解为在命令行中启动 Python 解释器,以便在控制台中交互式地输入语句并查看结果。如果你有更具体的问题,可以进一步描述,这样就能得到更准确的回答。

参考资料:
1. python-console-snake(在控制台中运行的轻量级贪吃蛇游戏):https://download.csdn.net/download/weixin_38744207/11771319
2. python-browser-console(轻松获得实时运行的应用程序的交互式 Python 控制台):https://download.csdn.net/download/weixin_42157188/19853071
3. Python 爬虫实战笔记——股票爬取示例:https://download.csdn.net/download/weixin_52057528/88258593
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值