from bs4 import BeautifulSoup
import re
import requests
from urllib.parse import urlencode
import json
from requests.exceptions import RequestException
import os
from hashlib import md5
from multiprocessing import Pool
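
# Overall flow: get_page_index fetches one page of Toutiao search results as JSON,
# get_page yields the article URLs it contains, get_page_detail fetches each
# article's HTML, parse_page extracts the image URLs, and download_images /
# save_images store the pictures on disk, one worker process per offset.

# Build the search URL for one page of results and return the JSON response body.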
def get_page_index(offset, keyword):
    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": 20,
        "cur_tab": 1,
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    user_agent = " "  # placeholder: put a Mac browser User-Agent string here
    headers = {"User-Agent": user_agent}  # disguise the crawler as a Mac user (fixed header format)
    response = requests.get(url, headers=headers)
    return response.text
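
# Parse the search-results JSON; the response is expected to carry a "data" list
# whose items each hold an "article_url" field.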
def get_page(html):
    data = json.loads(html)
    if data and "data" in data.keys():
        for item in data.get("data"):
            yield item.get("article_url")
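
# Fetch the raw HTML of a single article page.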
def get_page_detail(url):
    response = requests.get(url)
    return response.text
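
# Pull the page title and every image URL out of an article page.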
def parse_page(html):
    soup = BeautifulSoup(html, "lxml")
    title = soup.select("title")[0].get_text()
    print("Title: " + title)
    # Image sources are embedded as protocol-relative URLs: src="//host/path"
    images_url = re.compile(r'src="//(.*?)"', re.S)
    result = images_url.findall(html)
    for url in result:
        print("Picture URL: " + url)
        download_images(url)
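
# Download one image; failures are reported rather than raised.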
def download_images(url):
    try:
        # The scraped URLs are protocol-relative, so prepend the scheme
        response = requests.get("https://" + url)
        if response.status_code == 200:
            print("Downloading " + url)
            save_images(response.content)
        return None
    except RequestException:
        print(url + " download failed")
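
# Write the image bytes to disk, skipping files that already exist.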
def save_images(content):
    # Name the file after the MD5 hash of its bytes so duplicate images are skipped
    file_path = '{0}/{1}.{2}'.format("F:/anaconda/street_shot", md5(content).hexdigest(), "jpg")
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(content)
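
# Crawl one page of search results end to end.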
def main(offset):
    keyword = "street shot"
    html = get_page_index(offset, keyword)
    for url in get_page(html):
        try:
            html = get_page_detail(url)
            if html:
                parse_page(html)
        except RequestException:
            print("Error")

if __name__ == '__main__':
    start = 1
    end = 2
    groups = [x * 20 for x in range(start, end + 1)]  # offsets: 20, 40, ...
    # main(groups)
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()  # block the main process until every worker exits
    print("Crawling finished!")