今日头条街拍爬取

本次练习使用到的知识点有
* Requests 库的使用
* BeautifulSoup 库的使用
* 正则表达式的使用
* pymongo 库的使用
1、项目流程分析
Clipboard Image.png

2、中心调度
def main(offset):
    """Crawl one search-result page: fetch the index, then every gallery on it.

    Args:
        offset: paging offset for the Toutiao search API (a multiple of 20;
            supplied as an int by the process pool).
    """
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # offset is an int: format it instead of concatenating with "+",
        # which raised TypeError in the original.
        print('offset:{} 异常 '.format(offset))
        return
    # Walk every detail-page URL found on the listing page.
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # Original used "%s" with str.format (never substituted) and then
            # fell through via "pass", passing None to parse_page_detail.
            print('url:{} 异常 '.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
3、请求和解析列表页
def get_page_index(offset, keywords):
    """Request one page of Toutiao image-search results.

    Args:
        offset: paging offset.
        keywords: search keyword. The original ignored this parameter and
            read the global KEYWORDS; since every caller passes KEYWORDS,
            honoring the parameter is behavior-compatible.

    Returns:
        The response body as text on HTTP 200, otherwise None (including
        on any network error).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_index(text):
    """Yield every article URL found in a listing-page JSON payload.

    Entries that carry no 'article_url' (e.g. ad/image-only cards) are
    skipped; the original yielded None for them, which downstream code
    then tried to fetch.

    Args:
        text: raw JSON text of the listing response.

    Yields:
        str: each item's 'article_url' under the top-level 'data' key.
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                url = item.get('article_url')
                if url:
                    yield url
    except JSONDecodeError:
        print(' 解析异常 ')
        return
4、请求和解析详情页
def parse_page_detail(html, url):
    """Extract the gallery images from a detail page and download them.

    Args:
        html: HTML of the detail page.
        url: the page's URL (stored alongside the result).

    Returns:
        dict with 'title', 'url', 'images' keys when a gallery was found,
        otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Pages without a <title> would raise AttributeError on .string; guard it.
    title = soup.title.string if soup.title else ''
    image_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if data and 'sub_images' in data.keys():
        images = [item.get('url') for item in data.get('sub_images')]
        for image in images:
            download_image(image)
        return {'title': title, 'url': url, 'images': images}
    return None
5、下载图片和保存至Mongodb
def download_image(url):
    """Download one image and hand its bytes to save_image().

    Best-effort: network failures are logged and swallowed so one broken
    image does not abort the whole gallery.

    Args:
        url: absolute URL of the image.
    """
    try:
        print(' 图片 ' + url + ' 正在下载 ')
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        # The dead "pass" after this print in the original is removed.
        print(' 异常 image:' + url)


def save_image(content):
    """Write image bytes to ./images/<md5>.jpg, deduplicating by content.

    The file name is the MD5 of the bytes, so an identical image is only
    written once.

    Args:
        content: raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    # The original crashed with FileNotFoundError if ./images was missing.
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, '{}.jpg'.format(md5(content).hexdigest()))
    if not os.path.exists(file_path):
        # "wb" suffices; the explicit close() inside "with" was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)


def save_to_mongo(data):
    """Insert one crawled record into MongoDB.

    Args:
        data: dict with 'title', 'url' and 'images' keys.

    Returns:
        True if the write was acknowledged, otherwise False.
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported replacement.
    result = db[MONGO_TABLE].insert_one(data)
    if result.acknowledged:
        print(' 成功保存 ' + data['title'])
        return True
    return False
6、完整代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
import requests
import pymongo
import json
from hashlib import md5
from bs4 import BeautifulSoup
from setting import *
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from multiprocessing import Pool

# Module-level MongoDB connection shared by save_to_mongo();
# MONGO_URL and MONGO_DB come from setting.py via "from setting import *".
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_page_index(offset, keywords):
    """Request one page of Toutiao image-search results.

    Args:
        offset: paging offset.
        keywords: search keyword. The original ignored this parameter and
            read the global KEYWORDS; since every caller passes KEYWORDS,
            honoring the parameter is behavior-compatible.

    Returns:
        The response body as text on HTTP 200, otherwise None (including
        on any network error).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_index(text):
    """Yield every article URL found in a listing-page JSON payload.

    Entries that carry no 'article_url' (e.g. ad/image-only cards) are
    skipped; the original yielded None for them, which downstream code
    then tried to fetch.

    Args:
        text: raw JSON text of the listing response.

    Yields:
        str: each item's 'article_url' under the top-level 'data' key.
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                url = item.get('article_url')
                if url:
                    yield url
    except JSONDecodeError:
        print(' 解析异常 ')
        return


def get_page_detail(url):
    """Fetch one detail page.

    Args:
        url: absolute URL of the article/gallery page.

    Returns:
        The page HTML on HTTP 200, otherwise None (including on any
        network error).
    """
    try:
        # In the original the requests.get() call sat OUTSIDE the try
        # block, so RequestException was never actually caught.
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_detail(html, url):
    """Extract the gallery images from a detail page and download them.

    Args:
        html: HTML of the detail page.
        url: the page's URL (stored alongside the result).

    Returns:
        dict with 'title', 'url', 'images' keys when a gallery was found,
        otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Pages without a <title> would raise AttributeError on .string; guard it.
    title = soup.title.string if soup.title else ''
    image_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if data and 'sub_images' in data.keys():
        images = [item.get('url') for item in data.get('sub_images')]
        for image in images:
            download_image(image)
        return {'title': title, 'url': url, 'images': images}
    return None


def download_image(url):
    """Download one image and hand its bytes to save_image().

    Best-effort: network failures are logged and swallowed so one broken
    image does not abort the whole gallery.

    Args:
        url: absolute URL of the image.
    """
    try:
        print(' 图片 ' + url + ' 正在下载 ')
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        # The dead "pass" after this print in the original is removed.
        print(' 异常 image:' + url)


def save_image(content):
    """Write image bytes to ./images/<md5>.jpg, deduplicating by content.

    The file name is the MD5 of the bytes, so an identical image is only
    written once.

    Args:
        content: raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    # The original crashed with FileNotFoundError if ./images was missing.
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, '{}.jpg'.format(md5(content).hexdigest()))
    if not os.path.exists(file_path):
        # "wb" suffices; the explicit close() inside "with" was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)


def save_to_mongo(data):
    """Insert one crawled record into MongoDB.

    Args:
        data: dict with 'title', 'url' and 'images' keys.

    Returns:
        True if the write was acknowledged, otherwise False.
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported replacement.
    result = db[MONGO_TABLE].insert_one(data)
    if result.acknowledged:
        print(' 成功保存 ' + data['title'])
        return True
    return False


def main(offset):
    """Crawl one search-result page: fetch the index, then every gallery on it.

    Args:
        offset: paging offset for the Toutiao search API (a multiple of 20;
            supplied as an int by the process pool).
    """
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # offset is an int: format it instead of concatenating with "+",
        # which raised TypeError in the original.
        print('offset:{} 异常 '.format(offset))
        return
    # Walk every detail-page URL found on the listing page.
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # Original used "%s" with str.format (never substituted) and then
            # fell through via "pass", passing None to parse_page_detail.
            print('url:{} 异常 '.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)


if __name__== '__main__' :
groups = [x* 20 for x in range (GROUP_START,GROUP_END+ 1 )]
pool = Pool()
pool.map(main, groups)
7、运行结果
Clipboard Image.png


Clipboard Image.png


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值