from bs4 import BeautifulSoup
import re
import requests
from urllib.parse import urlencode
import json
from requests.exceptions import RequestException
import os
from hashlib import md5
from multiprocessing import Pool
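
# Overall flow: get_page_index fetches one page of Toutiao search results as JSON,
# get_page yields the article URLs it contains, get_page_detail fetches each
# article's HTML, parse_page extracts the image URLs, and download_images /
# save_images store the pictures on disk, one worker process per offset.

# Build the search URL for one page of results and return the JSON response body.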
def get_page_index(offset, keyword):
    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": 20,
        "cur_tab": 1,
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    user_agent = " "  # placeholder: put a Mac browser User-Agent string here
    headers = {"User-Agent": user_agent}  # disguise the crawler as a Mac user (fixed header format)
    response = requests.get(url, headers=headers)
    return response.text
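
# Parse the search-results JSON; the response is expected to carry a "data" list
# whose items each hold an "article_url" field.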
def get_page(html):
    data = json.loads(html)
    if data and "data" in data.keys():
        for item in data.get("data"):
            yield item.get("article_url")
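
# Fetch the raw HTML of a single article page.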
def get_page_detail(url):
    response = requests.get(url)
    return response.text
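
# Pull the page title and every image URL out of an article page.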
def parse_page(html):
    soup = BeautifulSoup(html, "lxml")
    title = soup.select("title")[0].get_text()
    print("Title: " + title)
    # Image sources are embedded as protocol-relative URLs: src="//host/path"
    images_url = re.compile(r'src="//(.*?)"', re.S)
    result = images_url.findall(html)
    for url in result:
        print("Picture URL: " + url)
        download_images(url)
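
# Download one image; failures are reported rather than raised.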
def download_images(url):
    try:
        # The scraped URLs are protocol-relative, so prepend the scheme
        response = requests.get("https://" + url)
        if response.status_code == 200:
            print("Downloading " + url)
            save_images(response.content)
        return None
    except RequestException:
        print(url + " download failed")
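
# Write the image bytes to disk, skipping files that already exist.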
def save_images(content):
    # Name the file after the MD5 hash of its bytes so duplicate images are skipped
    file_path = '{0}/{1}.{2}'.format("F:/anaconda/street_shot", md5(content).hexdigest(), "jpg")
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(content)
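
# Crawl one page of search results end to end.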
def main(offset):
    keyword = "street shot"
    html = get_page_index(offset, keyword)
    for url in get_page(html):
        try:
            html = get_page_detail(url)
            if html:
                parse_page(html)
        except RequestException:
            print("Error")

if __name__ == '__main__':
    start = 1
    end = 2
    groups = [x * 20 for x in range(start, end + 1)]  # offsets: 20, 40, ...
    # main(groups)
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()  # block the main process until every worker exits
    print("Crawling finished!")