# 爬个虫 — a little web crawler (scrapes wallpapers from www.netbian.com)
# 哟呼 (yoo-hoo!)
# A highlighted block
import requests
from lxml import etree
from lxml import html
from html.parser import HTMLParser #导入html解析库
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.request
import os
import requests
from pathlib import Path
import random
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Sends a desktop-Chrome User-Agent header (the site blocks the
    default urllib agent), then sleeps a random 0-15 seconds to
    throttle scraping before returning.
    """
    req = urllib.request.Request(url)
    # Add a header to impersonate the Google Chrome browser.
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    # Context manager guarantees the connection is closed even if
    # read() raises (the original leaked the response on error).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # Random delay between requests to avoid hammering the server.
    sleeptime = random.randint(0, 15)
    time.sleep(sleeptime)
    return html
def w_file(filepath, contents):
    """Write *contents* to *filepath* using the GB18030 encoding."""
    out = open(filepath, 'w', encoding='gb18030')
    try:
        out.write(contents)
    finally:
        out.close()
def save_imgs(folder, img_addrs):
    """Download every URL in *img_addrs* into directory *folder*.

    Each file is named after the last path component of its URL.
    The directory is created if it does not exist.
    """
    # exist_ok avoids the racy exists()-then-mkdir check of the original.
    os.makedirs(folder, exist_ok=True)
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Build the full target path instead of os.chdir'ing in and back
        # out: the original mutated the process CWD and never restored it
        # if the loop raised mid-way.
        target = Path(folder) / filename
        with open(target, 'wb') as f:
            img = url_open(each)
            f.write(img)
def getHTMLText(url):
    """Fetch *url* with requests and return the decoded page text.

    NOTE(review): the original body carried an unreachable PhantomJS/
    Selenium variant after this function's return statement (dead code);
    it has been removed — PhantomJS support is deprecated in Selenium.
    """
    res = requests.get(url)
    # netbian.com serves GB2312-encoded pages; force the encoding so
    # res.text decodes correctly instead of using requests' guess.
    res.encoding = 'GB2312'
    return res.text
def findPicUrl(htmlText):
    """Extract full-size picture URLs from a netbian.com index page.

    Scans *htmlText* for '/desk/<id>.htm' links, builds the 1920x1080
    detail-page URL for each, then fetches every detail page and pulls
    the real .jpg address out of the section after its 'endimg' marker.

    Returns a list of .jpg URL strings.
    """
    img_addrs = []
    retPicUrls = []
    # First pass: collect detail-page links from the index page.
    a = htmlText.find('/desk/')
    while a != -1:
        # The closing '.htm' should appear within 100 chars of the link.
        b = htmlText.find('.htm', a, a + 100)
        if b != -1:
            img_addrs.append('http://www.netbian.com' + htmlText[a:b] + '-1920x1080.htm')
        else:
            # Malformed link: jump ahead so the scan still makes progress.
            b = a + 30
        a = htmlText.find('/desk/', b + 4)
    # Second pass: open each detail page and grab the real image source.
    for each in img_addrs:
        print(each)
        picHtmlSources = getHTMLText(each)
        # The real <img src="...jpg"> sits after the 'endimg' marker.
        data = picHtmlSources.find('endimg')
        imgsrcBegin = picHtmlSources.find("<img src", data)
        imgsrcEnd = picHtmlSources.find(".jpg", imgsrcBegin)
        # Guard against pages missing the expected markup: the original
        # sliced with -1 offsets and appended garbage in that case.
        if imgsrcBegin != -1 and imgsrcEnd != -1:
            retPicUrls.append(picHtmlSources[imgsrcBegin + 10:imgsrcEnd + 4])
    return retPicUrls
if __name__ == '__main__':
    # Walk the first ten index pages and save every wallpaper they link to.
    for page in range(10):
        baseAddr = 'http://www.netbian.com/index_%d.htm' % page
        picUrls = findPicUrl(getHTMLText(baseAddr))
        save_imgs("test", picUrls)