from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import requests
import threading
import os
import re
def gethtml(url: str) -> str:
    """Load *url* in headless Chrome and return the rendered page source.

    The page is opened, refreshed once, and given three seconds to finish
    loading before its HTML is captured.  The browser is always shut down,
    even when loading the page fails.

    Args:
        url: Address of the page to load.

    Returns:
        The HTML reported by ``driver.page_source`` after the wait.
    """
    chrome_options = Options()
    # Headless mode: scrape without opening a visible browser window.
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Exclude the "enable-automation" switch (the original author describes
    # this as turning on Chrome's developer mode).
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Per the original author: the first load hits a captcha and a
        # single refresh gets past it.
        driver.refresh()
        # Give the browser three seconds to react; without this pause the
        # wanted data has not been loaded yet.
        sleep(3)
        return driver.page_source
    finally:
        # quit() ends the whole session, so a separate close() is not needed;
        # running it in ``finally`` avoids leaking a Chrome process on errors.
        driver.quit()
def parserurl(soup):
    """Extract ``(div id, group path)`` pairs from a page's HTML text.

    Each match starts at a ``<div id="...">`` and runs to the next
    ``href="/group/...">`` on the same line; the first tuple item is the
    div's id and the second is whatever follows ``/group/`` in the link.

    Args:
        soup: HTML source as a string.

    Returns:
        A list of ``(div_id, group_path)`` tuples, empty when nothing matches.
    """
    group_link = re.compile('<div id="(.*?)".*?href="/group/(.*?)">')
    return group_link.findall(soup)
def geturls(html):
    """Collect the absolute image URLs found in *html*.

    Only ``<img src="...">`` attributes whose value starts with ``http`` are
    picked up.  The resulting list is printed before being returned.

    Args:
        html: HTML source as a string.

    Returns:
        A list of image URL strings, in document order.
    """
    img_src = re.compile('<img src="(http.*?)"')
    found = img_src.findall(html)
    print(found)
    return found
# TODO: getpicurl() / getpict() helpers were only stubbed out here (empty
# bodies) and never implemented.
def _download_albums(albums, save_dir):
    """Save every image of each ``(name, image_urls)`` album under ``save_dir/name``."""
    for name, image_urls in albums:
        album_dir = os.path.join(save_dir, name)
        # Absolute path + exist_ok: no os.chdir() (process-global, unsafe in
        # threads) and no crash when the folder is already there.
        os.makedirs(album_dir, exist_ok=True)
        for index, image_url in enumerate(image_urls, start=1):
            try:
                response = requests.get(image_url, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                # One broken image must not abort the rest of the album.
                print('skipping {}: {}'.format(image_url, exc))
                continue
            # Open the file only after the download succeeded so a failed
            # request never leaves an empty .jpg behind.
            target = os.path.join(album_dir, '{}.jpg'.format(index))
            with open(target, 'wb') as f:
                f.write(response.content)


def main(save_dir='E:/桌面/222'):
    """Scrape the Toutiao search results for "街拍" and download the images.

    The search page is rendered with ``gethtml``; ``parserurl`` yields one
    ``(div id, group path)`` pair per result.  Each result page is rendered in
    turn and its image URLs are collected with ``geturls``.  The images are
    then downloaded by two threads: one takes the even-indexed results, the
    other the odd-indexed ones.  Images are stored as
    ``<save_dir>/<div id>/<n>.jpg`` with ``n`` starting at 1.

    Args:
        save_dir: Directory that receives one sub-folder per result.  It is
            created if missing.  Defaults to the location the script
            originally hard-coded.
    """
    url = 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D'
    url_1 = 'https://www.toutiao.com/'
    html = gethtml(url)

    albums = []
    for div_id, group_path in parserurl(html):
        page = gethtml(url_1 + 'a' + group_path)
        albums.append((div_id, geturls(page)))
    print([name for name, _ in albums])

    # Start of the multithreaded part: split the work into even/odd halves.
    workers = [
        threading.Thread(target=_download_albums, args=(albums[start::2], save_dir))
        for start in (0, 1)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
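# Toutiao (今日头条) "street snap" (街拍) image downloader, implemented in Python.
# Source page footer: latest recommended article published 2022-11-21 19:03:57.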