Python web crawler: scraping Luoo (落网) issue pages and downloading the images and music

1. Design
   1.1 Multiple processes each create a spider object; each spider consists of four threads: fetching pages, parsing pages, downloading images, and downloading music (a minimal sketch of the queue hand-off between these threads follows this list).
   1.2 Set the process and thread counts according to your network conditions.
   1.3 Images are saved under img/, music under music/ in folders named after the issue, and the text content goes into result.txt.
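
A minimal sketch of that queue hand-off between a fetch thread and a parse thread (the names are illustrative and a placeholder string stands in for the real HTTP call; the full implementation is in section 2):

import queue
import threading

url_q = queue.Queue()   # URLs waiting to be fetched
page_q = queue.Queue()  # fetched pages waiting to be parsed

def fetch():
	# producer: drain the URL queue, hand each page to the parser
	while not url_q.empty():
		url = url_q.get()
		page_q.put((url, "<html>...</html>"))  # stand-in for requests.get(url).text

def parse(n):
	# consumer: pull n pages off the queue and process them
	for _ in range(n):
		url, page = page_q.get()  # blocks until a page is available
		print("parsed", url)

url_q.put("http://www.luoo.net/music/001")
t1 = threading.Thread(target=fetch)
t2 = threading.Thread(target=parse, args=(1,))
t1.start(); t2.start()
t1.join(); t2.join()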
2. Code
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import re
import os
import threading
import requests
import queue
import time
import multiprocessing

class Spider:

	def __init__(self, path):
		# queue of URLs to fetch
		self.queUrl = queue.Queue()
		# queue of fetched page contents
		self.quePageInfo = queue.Queue()
		# file that collects the crawl results
		self.f = open(path, "w+")
		# crawler threads
		self.threads = []
		# lock guarding file writes and directory creation
		self.mu = threading.Lock()
		# queue of image URLs to download
		self.queImg = queue.Queue()
		# queue of music tracks to download
		self.queMusic = queue.Queue()
		# set up the download directories
		self.DownLoadPath()
	
	# extract the links contained in a page
	def GetUrl(self, page):
		regular = r'href="([^"]*)'
		pattern = re.compile(regular)
		result = pattern.findall(page)
		for i in result:
			self.queUrl.put(i)

	# put a given URL into the crawl queue
	def SetCapUrlQueue(self, url):
		self.queUrl.put(url)

	# create the image and music download directories
	def DownLoadPath(self):
		# image download directory
		self.pathImg = "./img"
		# music download directory
		self.pathMusic = "./music"
		# base URL for music downloads
		self.musicUrl = "http://mp3-cdn2.luoo.net/low/luoo/radio"
		# create each directory if it does not exist yet
		for path in (self.pathImg, self.pathMusic):
			if not os.path.exists(path):
				try:
					os.makedirs(path)
				except Exception:
					print("create", path, "err")
			
	# fetch pages and put them into the page queue
	def GetPage(self):
		while not self.queUrl.empty():
			url = self.queUrl.get()
			try:
				r = requests.get(url)
			except Exception:
				print("get", url, "err")
				continue
			self.quePageInfo.put([url, r.text])

	# download images
	def DownLoadImg(self):
		while True:
			while not self.queImg.empty():
				img = self.queImg.get()
				# skip files that already exist
				path = self.pathImg + "/" + img[1]
				if not os.path.exists(path):
					try:
						r = requests.get(img[0])
					except Exception:
						print("get err")
						continue
					if r.status_code == 200:
						with open(path, "wb") as out:
							out.write(r.content)
					else:
						# on failure, retry the GET up to five times
						for i in range(5):
							r = requests.get(img[0])
							if r.status_code == 200:
								with open(path, "wb") as out:
									out.write(r.content)
								break
							time.sleep(2)

			time.sleep(2)
	
	# download music
	def DownLoadMusic(self):
		while True:
			while not self.queMusic.empty():
				music = self.queMusic.get()
				path = self.pathMusic + "/" + music[0] + "/"
				# strip '/' from track names so they form a valid file name
				if '/' in music[1]:
					tmp = music[1].split('/')
					filePath = path + tmp[0] + tmp[1] + ".mp3"
				else:
					filePath = path + music[1] + ".mp3"

				# create one folder per issue
				if self.mu.acquire(True):
					if not os.path.exists(path):
						os.makedirs(path)
					self.mu.release()

				# download the track only if it is not already on disk
				if not os.path.exists(filePath):
					try:
						r = requests.get(music[2])
					except Exception:
						print("get err")
						continue
					if r.status_code == 200:
						print("downLoad", music[2])
						with open(filePath, "wb") as out:
							out.write(r.content)
					else:
						# on failure, retry up to five times with an
						# alternative URL ('/0' replaced by '/')
						for i in range(5):
							newUrl = music[2].replace('/0', '/')
							print("redownLoad", newUrl)
							r = requests.get(newUrl)
							if r.status_code == 200:
								with open(filePath, "wb") as out:
									out.write(r.content)
								break
							time.sleep(2)

			time.sleep(2)
		
	
	# create music download threads
	def CreateDownLoadMusicThread(self, num):
		for i in range(num):
			t = threading.Thread(target=self.DownLoadMusic, args=())
			self.threads.append(t)

	# create page fetching threads
	def CreateGetPageThread(self, num):
		for i in range(num):
			t = threading.Thread(target=self.GetPage, args=())
			self.threads.append(t)

	# create page parsing threads
	def CreateParsePageThread(self, num):
		for i in range(num):
			t = threading.Thread(target=self.ParsePage, args=())
			self.threads.append(t)

	# create image download threads
	def CreateDownLoadImgThread(self, num):
		for i in range(num):
			t = threading.Thread(target=self.DownLoadImg, args=())
			self.threads.append(t)

	# start all threads, then wait on them to keep the process alive
	def Run(self):
		for t in self.threads:
			t.daemon = True
			t.start()
		for t in self.threads:
			t.join()

	# take pages off the queue and parse them
	def ParsePage(self):
		while True:
			while not self.quePageInfo.empty():
				a = self.quePageInfo.get()
				# image links
				regular  = r'(<img src="(http://img-cdn2\.luoo\.net/pics/vol/([^!]*)![^"]*))|'
				# description
				regular += r'(<meta name="description" content="([^"]*))|'
				# music theme keywords
				regular += r'(<meta name="keywords" content="([^"]*))|'
				# issue number
				regular += r'(vol-number rounded">([^<]*))|'
				# issue title
				regular += r'(vol-title">([^<]*))|'
				# track names
				regular += r'(trackname btn-play">([^<]*))'
				pattern = re.compile(regular)
				result = pattern.findall(a[1])
				if len(result) < 10:
					continue

				i = 0
				first = 0
				content = a[0] + '\n'
				imgName = ""
				music = ""

				for tmp in result:
					if i == 0:
						# description
						content += tmp[4] + '\n'
					elif i == 1:
						# music theme
						content += "@mark " + tmp[6] + '\n'
					elif i == 2:
						# issue number
						music = str(int(tmp[8]))
						content += "@vol  " + tmp[8] + '\n'
						imgName = tmp[8] + ".jpg"
					elif i == 3:
						# issue title
						content += "@tip  " + tmp[10] + '\n'
					elif tmp[0] != '':
						first = first + 1
						# the first image is the cover
						if first == 1:
							# save the image URL and file name
							img = [tmp[1], imgName]
							self.queImg.put(img)
							content += "@img  " + imgName + '\n'
							content += "@music\n"
					else:
						# track name
						content += "      " + tmp[12] + '\n'
						# build and save the music download URL
						s = tmp[12].split('.')
						path = self.musicUrl + music + "/" + s[0] + ".mp3"
						info = [music, tmp[12], path]
						self.queMusic.put(info)
					i = i + 1

				# acquire the lock and write the results to the file
				if self.mu.acquire(True):
					self.f.write(content)
					self.mu.release()
			time.sleep(2)
	
	
	# close the result file and quit
	def Quit(self):
		self.f.close()

def worker(num):
	path = 'result' + str(num) + '.txt'
	Luo = Spider(path)
	avg = 250
	num = num * avg + avg
	for i in range(num - avg, num):
		# issue numbers are zero-padded to three digits
		url = "http://www.luoo.net/music/" + str(i).zfill(3)
		Luo.SetCapUrlQueue(url)
	Luo.CreateGetPageThread(1)
	Luo.CreateParsePageThread(1)
	Luo.CreateDownLoadImgThread(1)
	Luo.CreateDownLoadMusicThread(1)
	Luo.Run()

# create the worker processes
def RunSpider(num):
	for i in range(0, num):
		p = multiprocessing.Process(target=worker, args=(i,))
		p.start()

if __name__ == '__main__':
	RunSpider(1)
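
As written, RunSpider(1) starts one worker process that crawls issues 0-249 and writes its text output to result0.txt; each additional process takes the next block of 250 issues and its own resultN.txt. To scale up, raise the process count and, inside worker(), the per-stage thread counts, for example (illustrative figures, not tuned values):

# four processes covering issues 0-999
RunSpider(4)

Because the image and music download threads loop forever, each process keeps running after its fetch queue drains; stop it manually once the download queues stay empty.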
