# 爬个虫 — a little web crawler (scrapes wallpapers from www.netbian.com)
# 哟呼 (yoo-hoo!)
# A highlighted block
import requests
from lxml import etree
from lxml import html
from html.parser import HTMLParser #导入html解析库
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.request
import os
import requests
from pathlib import Path
import random
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Sends a desktop-Chrome User-Agent header (the site blocks the
    default urllib agent), then sleeps a random 0-15 seconds to
    throttle scraping before returning.
    """
    req = urllib.request.Request(url)
    # Add a header to impersonate the Google Chrome browser.
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    # Context manager guarantees the connection is closed even if
    # read() raises (the original leaked the response on error).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # Random delay between requests to avoid hammering the server.
    sleeptime = random.randint(0, 15)
    time.sleep(sleeptime)
    return html
def w_file(filepath, contents):
    """Write *contents* to *filepath* using the GB18030 encoding."""
    out = open(filepath, 'w', encoding='gb18030')
    try:
        out.write(contents)
    finally:
        out.close()
def save_imgs(folder, img_addrs):
    """Download every URL in *img_addrs* into directory *folder*.

    Each file is named after the last path component of its URL.
    The directory is created if it does not exist.
    """
    # exist_ok avoids the racy exists()-then-mkdir check of the original.
    os.makedirs(folder, exist_ok=True)
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Build the full target path instead of os.chdir'ing in and back
        # out: the original mutated the process CWD and never restored it
        # if the loop raised mid-way.
        target = Path(folder) / filename
        with open(target, 'wb') as f:
            img = url_open(each)
            f.write(img)
def getHTMLText(url):
    """Fetch *url* with requests and return the decoded page text.

    NOTE(review): the original body carried an unreachable PhantomJS/
    Selenium variant after this function's return statement (dead code);
    it has been removed — PhantomJS support is deprecated in Selenium.
    """
    res = requests.get(url)
    # netbian.com serves GB2312-encoded pages; force the encoding so
    # res.text decodes correctly instead of using requests' guess.
    res.encoding = 'GB2312'
    return res.text
def findPicUrl(htmlText):
    """Extract full-size picture URLs from a netbian.com index page.

    Scans *htmlText* for '/desk/<id>.htm' links, builds the 1920x1080
    detail-page URL for each, then fetches every detail page and pulls
    the real .jpg address out of the section after its 'endimg' marker.

    Returns a list of .jpg URL strings.
    """
    img_addrs = []
    retPicUrls = []
    # First pass: collect detail-page links from the index page.
    a = htmlText.find('/desk/')
    while a != -1:
        # The closing '.htm' should appear within 100 chars of the link.
        b = htmlText.find('.htm', a, a + 100)
        if b != -1:
            img_addrs.append('http://www.netbian.com' + htmlText[a:b] + '-1920x1080.htm')
        else:
            # Malformed link: jump ahead so the scan still makes progress.
            b = a + 30
        a = htmlText.find('/desk/', b + 4)
    # Second pass: open each detail page and grab the real image source.
    for each in img_addrs:
        print(each)
        picHtmlSources = getHTMLText(each)
        # The real <img src="...jpg"> sits after the 'endimg' marker.
        data = picHtmlSources.find('endimg')
        imgsrcBegin = picHtmlSources.find("<img src", data)
        imgsrcEnd = picHtmlSources.find(".jpg", imgsrcBegin)
        # Guard against pages missing the expected markup: the original
        # sliced with -1 offsets and appended garbage in that case.
        if imgsrcBegin != -1 and imgsrcEnd != -1:
            retPicUrls.append(picHtmlSources[imgsrcBegin + 10:imgsrcEnd + 4])
    return retPicUrls
if __name__ == '__main__':
    # Walk the first ten index pages and save every wallpaper they link to.
    for page in range(10):
        baseAddr = 'http://www.netbian.com/index_%d.htm' % page
        picUrls = findPicUrl(getHTMLText(baseAddr))
        save_imgs("test", picUrls)