# 煎蛋网妹子图网址: 'http://jandan.net/ooxx/'
# 版本: Python 3.x
# 需要自行下载并配置 selenium。
# -*- coding: utf-8 -*-
import os
import requests
from selenium.webdriver.support.wait import WebDriverWait
from selenium import webdriver
import re
from pyquery import PyQuery as pq
import threading
# HTTP headers sent with every requests.get() call made by url_open().
# The long User-Agent value is assembled from adjacent string literals.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/66.0.3359.139 Safari/537.36'
    ),
}
# Shared Chrome WebDriver instance, created as soon as this module runs;
# fetch_html() drives it to load each page.
browser = webdriver.Chrome()
# Explicit-wait helper, currently disabled:
#wait = WebDriverWait(browser,25)
def fetch_html(url):
    """Open ``url`` in the shared browser and return the resulting markup.

    Args:
        url: Address of the page to load.

    Returns:
        The ``page_source`` of the module-level ``browser`` after navigating
        to ``url``.
    """
    driver = browser
    driver.get(url)
    page = driver.page_source
    return page
def parse_html(html):
    """Download the image linked from every comment row of a page.

    Each element matching ``#comments .commentlist .row`` is searched for
    the first ``href`` of an ``<a>`` tag that follows a ``<p>`` and is
    followed by ``target``. Rows without such a link are skipped. The raw
    href of every downloaded image is printed.

    Args:
        html: Page markup as a string.
    """
    # Compile once per page instead of looking the pattern up for every row.
    link_pattern = re.compile(r'<p>.*?<a.*?href="(.*?)".*?target.*?', re.S)

    # Parse the document a single time (it used to be parsed twice).
    doc = pq(html)
    for item in doc('#comments .commentlist .row').items():
        match = link_pattern.search(str(item))
        if match is None:
            continue
        href = match.group(1)
        # Protocol-relative links ("//host/path") need a scheme; links that
        # already carry one must not get a second "http:" glued on.
        image_url = 'http:' + href if href.startswith('//') else href
        download_image(image_url)
        print(href)
def url_open(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes`` when the status code is 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would hang the calling worker thread forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError and also
        # covers timeouts and malformed URLs, so one bad link cannot crash
        # the thread.
        print("打开网页错误!")
        return None
    if response.status_code == 200:
        return response.content
    return None
def download_image(url):
    """Save the resource at ``url`` into the current working directory.

    The file is named after the last ``/``-separated segment of ``url``.
    Nothing is written when ``url_open`` returns no data.

    Args:
        url: Address of the image to download.
    """
    data = url_open(url)
    if not data:
        return
    name = url.rsplit('/', 1)[-1]
    with open(name, 'wb+') as out:
        out.write(data)
if __name__=='__main__':
    # All images are saved into ./OOXX; create it on the first run.
    os.makedirs("OOXX", exist_ok=True)
    os.chdir("OOXX")

    workers = []
    try:
        for i in range(1, 51):
            # i is an int, so it must be converted before concatenation
            # (the bare "'第' + i" raised TypeError).
            print('第' + str(i) + '页')
            url = 'http://jandan.net/ooxx/page-' + str(i) + '#comments'
            html = fetch_html(url)
            # Parsing and downloading run in a background thread so the
            # browser can move on to the next page right away.
            t = threading.Thread(target=parse_html, args=(html,))
            t.start()
            workers.append(t)
    finally:
        # Only the loop above uses the browser; the workers operate on the
        # already-fetched HTML, so the driver can be shut down now.
        browser.quit()
        # Wait for the outstanding downloads to finish.
        for t in workers:
            t.join()