# 今日头条街拍图片爬虫 — Toutiao "street snap" image crawler

import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: pagination offset (the site pages in steps of 20).
        keyword: search term to query for.

    Returns:
        The response body (a JSON string) as text.
    """
    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": 20,
        "cur_tab": 1,
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    # Present the crawler as a regular Mac browser so the site does not
    # reject it outright.  NOTE(review): the original UA string was lost
    # in extraction (it was blank) — confirm the exact value if needed.
    user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/66.0.3359.181 Safari/537.36")
    headers = {"User-Agent": user_agent}
    response = requests.get(url, headers=headers)
    return response.text

def get_page(html):
    """Yield article URLs from a Toutiao search-results JSON string.

    Args:
        html: JSON text as returned by get_page_index().

    Yields:
        Each entry's "article_url" value (may be None for entries
        lacking that key).
    """
    data = json.loads(html)
    # The API wraps results in a top-level "data" list; skip responses
    # that are empty or lack it.
    if data and "data" in data.keys():
        for item in data.get("data"):
            yield item.get("article_url")

def get_page_detail(url):
    """Fetch an article detail page and return its HTML text.

    Args:
        url: full URL of the article page.

    Returns:
        The response body as text.  Any RequestException propagates to
        the caller (main() catches it).
    """
    response = requests.get(url)
    return response.text

def paese_page(html):
    """Parse an article page: print its title and download every image.

    NOTE(review): the name keeps the original typo ("paese" ~ "parse")
    because main() calls it under this name.

    Args:
        html: HTML text of one article page.
    """
    soup = BeautifulSoup(html, "lxml")
    title = soup.select("title")[0].get_text()
    print(" street title: " + title)
    # Match protocol-relative image sources: src="//host/path".
    # NOTE(review): the original regex literal was garbled in extraction;
    # this reconstruction captures everything after the "//" — confirm
    # against the live page markup.
    images_url = re.compile(r'src="//(.*?)"', re.S)
    result = images_url.findall(html)
    for url in result:
        print(" picture name: " + url)
        download_images(url)

def download_images(url):
    """Download one image and hand the bytes to save_images().

    Args:
        url: scheme-less image URL (e.g. "p3.pstatp.com/.../x.jpg");
            "https://" is prepended before the request.
    """
    try:
        response = requests.get("https://" + url)
        if response.status_code == 200:
            print(" downloading " + url)
            save_images(response.content)
        return None
    except RequestException:
        # Best-effort: report and move on to the next image.
        print(url + " Download failed ")

def save_images(content):
    """Write image bytes to disk, named by the MD5 of the content.

    The MD5-based filename deduplicates downloads: a file that already
    exists is silently skipped.

    Args:
        content: raw image bytes from the HTTP response.
    """
    file_path = '{0}/{1}.{2}'.format("F:/anaconda/ Street shot ", md5(content).hexdigest(), "jpg")
    if not os.path.exists(file_path):
        # "wb" because content is binary; the with-block closes the file,
        # so no explicit f.close() is needed.
        with open(file_path, "wb") as f:
            f.write(content)

def main(offset):
    """Crawl one search-results page: resolve each article and parse it.

    Args:
        offset: pagination offset passed through to get_page_index().
    """
    keyword = " street shot "
    html = get_page_index(offset, keyword)
    for url in get_page(html):
        try:
            detail_html = get_page_detail(url)
            if detail_html:
                paese_page(detail_html)
        except RequestException:
            # One failed article must not abort the whole page.
            print("Error ")

if __name__ == '__main__':
    start = 1
    end = 2
    # Offsets 20, 40, ... — one result page per worker process.
    groups = [x * 20 for x in range(start, end + 1)]
    # main(groups)
    pool = Pool()
    # Bug fix: the methods must be called on the pool instance, not on
    # the Pool class itself.
    pool.map(main, groups)
    pool.close()
    pool.join()  # main process blocks, waiting for the workers to exit
    print(" crawling end! ")
  

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值