A fairly simple crawler, for reference: it fetches the award-winning works from the site. The crawl log uses Python's standard logging library; a single basicConfig call is enough to write everything to a log file, which makes it very easy to review how a run went.

The code is somewhat verbose and rough, but it fetches the detail-page content, including the title, the description, and the image and file downloads.


Logging setup for the crawl:

# basic logging configuration
logging.basicConfig(filename='access.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(module)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.DEBUG)  # level 10


logging.debug('debug message')       # level 10
logging.info('info message')         # level 20
logging.warning('warning message')   # level 30
logging.error('error message')       # level 40
logging.critical('critical message') # level 50
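
As a side note, basicConfig as configured above writes only to the file; if the same messages should also show up on the console, the standard library's handler list does it in one call (a minimal sketch):

import logging

# log to access.log and to the console at the same time
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(module)s: %(message)s',
                    handlers=[logging.FileHandler('access.log', encoding='utf-8'),
                              logging.StreamHandler()])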

Source code for reference:

# Red Dot award works crawler
# WeChat official account: Python与SEO学习
import requests,os,re
import random,time
import logging
from lxml import etree


# basic logging configuration
logging.basicConfig(filename='access.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(module)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.DEBUG)  # level 10




def get_ua():
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    ua=random.choice(ua_list)
    return ua




def get_result(pagenum):
    url=f"https://www.red-dot.org/search/search.json?solr%5Bfilter%5D%5B%5D=meta_categories%3A%2F11%2F&solr%5Bfilter%5D%5B%5D=year%3A2024&solr%5Bpage%5D={pagenum}"
    response=get_resp(url)
    results=response.json()['result']['docs']
    print(results)
    for result in results:
        title = f"{result['title']}_{result['meta_first']}_{result['meta_fourth']}"
        url = f"https://www.red-dot.org{result['url']}"
        with open(f'{pagenum}.txt','a+',encoding='utf-8') as f:
            f.write(f'{url}\n')
        get_detail(title,url)
        time.sleep(2)
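

# Note: the hand-encoded Solr query string above can also be built with requests'
# params argument; a sketch (the key names are read off the encoded URL above,
# and build_search_url is a hypothetical helper, not part of the original script):
def build_search_url(pagenum):
    params = [
        ('solr[filter][]', 'meta_categories:/11/'),
        ('solr[filter][]', 'year:2024'),
        ('solr[page]', pagenum),
    ]
    # requests percent-encodes the brackets, colons and slashes automatically
    return requests.Request('GET', 'https://www.red-dot.org/search/search.json',
                            params=params).prepare().url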






def get_detail(title,url):
    pattern = r"[\/\\\:\*\?\"\<\>\|]"
    title = re.sub(pattern, "_", title)  # replace characters that are illegal in file names with "_"
    path = f'{title}/'
    os.makedirs(path, exist_ok=True)
    logging.info(f'Created directory {title}!')
    print(f'Fetching detail page content from {url} ..')
    logging.info(f'Fetching detail page content from {url} ..')
    response=get_resp(url)
    html=response.content.decode('utf-8')
    tree=etree.HTML(html)
    hs=tree.xpath('//div[@class="col-12"]//text()')
    h=''.join(hs)
    print(h)
    texts=tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//text()')
    text='\n'.join(texts)
    print(text)
    with open(f'{path}{title}.txt','w',encoding='utf-8') as f:
        f.write(f'{h}\n\n{text}')
    print(f"保存{title}.txt文件成功!")
    logging.info(f"保存{title}.txt文件成功!")
    imgs=tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//img/@src')
    print(imgs)
    get_imgs(path, imgs)
    downhref=tree.xpath('//a[@class="download-link"]/@href')[0]
    print(downhref)
    download_file(path, title, downhref)
    print(f'Fetched detail page content from {url}!')
    logging.info(f'Fetched detail page content from {url}!')
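

# Note: the [0] index on the download-link XPath above raises IndexError when a
# page has no download link; a defensive variant would be (sketch):
#     hrefs = tree.xpath('//a[@class="download-link"]/@href')
#     if hrefs:
#         download_file(path, title, hrefs[0])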








def get_imgs(path, imgs):
    # download every image on the detail page, numbered in page order
    for i, img in enumerate(imgs, start=1):
        imgname = f"{i}-{img.split('/')[-1]}"
        imgurl = f'https://www.red-dot.org{img}'
        print(f">> Downloading image: {imgname}")
        logging.info(f">> Downloading image: {imgname}")
        r = get_resp(imgurl)
        with open(f'{path}{imgname}', 'wb') as f:
            f.write(r.content)
        print(f"Image {imgname} downloaded!")
        logging.info(f"Image {imgname} downloaded!")
        time.sleep(1)




def download_file(path, title, downhref):
    downname = f'{title}.zip'
    print(f">> Downloading file: {downname}")
    logging.info(f">> Downloading file: {downname}")
    r = get_resp(downhref)
    with open(f'{path}{downname}', 'wb') as f:
        f.write(r.content)
    print(f"File {downname} downloaded!")
    logging.info(f"File {downname} downloaded!")
    time.sleep(2)
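

# Note: r.content buffers the whole ZIP in memory; for large archives a streamed
# download is safer. A sketch (download_file_streamed is a hypothetical variant):
def download_file_streamed(path, title, downhref):
    headers = {'User-Agent': get_ua()}
    with requests.get(downhref, headers=headers, timeout=30, stream=True) as r:
        r.raise_for_status()
        with open(f'{path}{title}.zip', 'wb') as f:
            # write the archive in 8 KB chunks instead of loading it all at once
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)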






# up to 3 retries with a growing back-off
def get_resp(url):
    i = 0
    while i < 4:
        try:
            response = get_response(url, timeout=10)
            # print(response.status_code)
            return response
        except requests.exceptions.RequestException:
            i += 1
            print(f">> Request failed, retrying (attempt {i}) in {i*2}s")
            logging.error(f">> {url} --- request failed, retrying (attempt {i}) in {i*2}s")
            time.sleep(i * 2)
    # all attempts failed: raise instead of silently returning None
    raise requests.exceptions.RequestException(f'giving up on {url}')
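

# Note: the same retry-with-back-off behaviour is also available out of the box
# from urllib3's Retry via a requests Session; a sketch (the status codes are an
# assumption, and make_session is a hypothetical helper):
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=2,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session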






def get_response(url, timeout):
    # get_ua() already returns a single UA string; random.choice on a string
    # would pick one character, so use the returned value directly
    headers = {
        'User-Agent': get_ua(),
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response






def main():
    for pagenum in range(1,11):
        print(f'>> Fetching JSON list page {pagenum} ..')
        logging.info(f'>> Fetching JSON list page {pagenum} ..')
        get_result(pagenum)
        time.sleep(6)






if __name__=="__main__":
    main()
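
To run it, save the script (e.g. as red_dot.py; the file name is arbitrary) and start it with Python 3.6 or later (it uses f-strings). It writes the crawl log to access.log, one {pagenum}.txt URL list per results page, and one folder per work holding the description text, the images, and the downloadable ZIP.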

Improved version (it can resume: URLs already written to the per-page .txt lists are skipped, and each detail fetch is wrapped in try/except so failures are recorded in fail.txt instead of aborting the run):

# Red Dot award works crawler (improved)
import requests,os,re
import random,time
import logging
from lxml import etree


# basic logging configuration
logging.basicConfig(filename='access.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(module)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.DEBUG)  # level 10




def get_ua():
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    ua=random.choice(ua_list)
    return ua




def get_result(pagenum):
    url=f"https://www.red-dot.org/search/search.json?solr%5Bfilter%5D%5B%5D=meta_categories%3A%2F11%2F&solr%5Bfilter%5D%5B%5D=year%3A2024&solr%5Bpage%5D={pagenum}"
    response=get_resp(url)
    results=response.json()['result']['docs']
    print(results)
    # page 86 was interrupted on an earlier run, so load the URLs that were
    # already collected for it and skip them (resume checkpoint)
    furls = quurls(pagenum) if pagenum == 86 else []
    for result in results:
        title = f"{result['title']}_{result['meta_first']}_{result['meta_fourth']}"
        url = f"https://www.red-dot.org{result['url']}"
        if url in furls:
            print(f"{url} -- already collected, skipping!")
            logging.info(f"{url} -- already collected, skipping!")
            continue
        with open(f'{pagenum}.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{url}\n')
        try:
            get_detail(title, url)
        except Exception as e:
            print(f"{e}--{url}")
            logging.error(f"{e}--{url}")
            with open('fail.txt', 'a+', encoding='utf-8') as f:
                f.write(f"{e}--{url}\n")
        time.sleep(2)






def get_detail(title,url):
    pattern = r"[\/\\\:\*\?\"\<\>\|\-’]"
    title = re.sub(pattern, "_", title)  # replace characters that are illegal in file names with "_"
    title = title.replace("\n", " ")
    path = f'{title}/'
    os.makedirs(path, exist_ok=True)
    logging.info(f'Created directory {title}!')
    print(f'Fetching detail page content from {url} ..')
    logging.info(f'Fetching detail page content from {url} ..')
    response=get_resp(url)
    html=response.content.decode('utf-8')
    tree=etree.HTML(html)
    hs=tree.xpath('//div[@class="col-12"]//text()')
    h=''.join(hs)
    print(h)
    texts=tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//text()')
    text='\n'.join(texts)
    print(text)
    with open(f'{path}{title}.txt','w',encoding='utf-8') as f:
        f.write(f'{h}\n\n{text}')
    print(f"保存{title}.txt文件成功!")
    logging.info(f"保存{title}.txt文件成功!")
    imgs=tree.xpath('//div[@class="col-12 col-md-10 offset-md-1"]//img/@src')
    print(imgs)
    get_imgs(path, imgs)
    downhref=tree.xpath('//a[@class="download-link"]/@href')[0]
    print(downhref)
    download_file(path, title, downhref)
    print(f'Fetched detail page content from {url}!')
    logging.info(f'Fetched detail page content from {url}!')








def get_imgs(path, imgs):
    # download every image on the detail page, numbered in page order
    for i, img in enumerate(imgs, start=1):
        imgname = f"{i}-{img.split('/')[-1]}"
        imgurl = f'https://www.red-dot.org{img}'
        print(f">> Downloading image: {imgname}")
        logging.info(f">> Downloading image: {imgname}")
        r = get_resp(imgurl)
        with open(f'{path}{imgname}', 'wb') as f:
            f.write(r.content)
        print(f"Image {imgname} downloaded!")
        logging.info(f"Image {imgname} downloaded!")
        time.sleep(1)




def download_file(path, title, downhref):
    downname = f'{title}.zip'
    print(f">> Downloading file: {downname}")
    logging.info(f">> Downloading file: {downname}")
    r = get_resp(downhref)
    with open(f'{path}{downname}', 'wb') as f:
        f.write(r.content)
    print(f"File {downname} downloaded!")
    logging.info(f"File {downname} downloaded!")
    time.sleep(2)






# up to 3 retries with a growing back-off
def get_resp(url):
    i = 0
    while i < 4:
        try:
            response = get_response(url, timeout=10)
            # print(response.status_code)
            return response
        except requests.exceptions.RequestException:
            i += 1
            print(f">> Request failed, retrying (attempt {i}) in {i*2}s")
            logging.error(f">> {url} --- request failed, retrying (attempt {i}) in {i*2}s")
            time.sleep(i * 2)
    # all attempts failed: raise instead of silently returning None
    raise requests.exceptions.RequestException(f'giving up on {url}')






def get_response(url, timeout):
    # get_ua() already returns a single UA string; random.choice on a string
    # would pick one character, so use the returned value directly
    headers = {
        'User-Agent': get_ua(),
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response


def quurls(page):
    # read back the URLs already collected for this page (one per line)
    with open(f'{page}.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(urls)
    return urls




def main():
    for pagenum in range(1,11):
        print(f'>> Fetching JSON list page {pagenum} ..')
        logging.info(f'>> Fetching JSON list page {pagenum} ..')
        get_result(pagenum)
        time.sleep(6)






if __name__=="__main__":
    main()
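
The page-86 branch in get_result is a one-off checkpoint left over from an interrupted run. A more general resume helper (a sketch under the same assumption that each page's URLs live in {page}.txt; collected_urls is a hypothetical name) would work for any page and tolerate a missing file:

import os

def collected_urls(page):
    # generalised version of quurls(): empty set when the page has no list yet
    if not os.path.exists(f'{page}.txt'):
        return set()
    with open(f'{page}.txt', 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}

With that in place, get_result could drop the pagenum == 86 special case and call collected_urls(pagenum) unconditionally.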

·················END·················

Hi, I'm 二大爷,

a migrant worker who left the old revolutionary base area to work in the city,

a non-early, non-professional internet webmaster,

fond of Python, writing, reading, and English,

a second-rate programmer, self-media writer, SEO . . .



Everyone who follows me has gone bald.

Sorry, I mean: has gotten stronger!

Don't believe it? Give it a try.


Scan the QR code to follow for the latest updates

WeChat official account ID: eryeji