The company requires a quarterly archive of the intranet: every article on the internal site has to be copied into a Word document. This chore has worn me down ever since I joined in March 2015. At first I copied the articles one by one, adjusting font sizes and paragraph spacing with painstaking care, which took several evenings. Later I got a bit craftier and copied everything first, then fixed the formatting in one pass, but that still ate up most of a day. Recently I finally managed to do it with Python, saving everything straight to a .docx file automatically; the only thing it cannot handle yet is tables.
The code below is only sparsely commented.
#-*- coding:utf-8 -*-#
# Collect the article URLs from the intranet CMS, then copy each article's
# title, body text and images into one .docx file.
import urllib
import requests
from bs4 import BeautifulSoup
import re
from docx import Document
from docx.shared import Inches

urls = []
indexs = []
for catid in [71,72,74,75,76]:
    # list page for this category
    index = 'http://192.168.9.240:9191/index.php?m=content&c=index&a=lists&catid=%d' % (catid)
    page = requests.get(index)                     # fetch the list page
    soup = BeautifulSoup(page.text,'html.parser')
    getNum = re.compile(r"\d+")                    # regex for the article count
    pageNum = getNum.match(soup.select('#pages > a')[0].text)
    # 15 articles per list page, so derive the page count from the total number of articles
    for pagefoot in range(1, int(pageNum.group(0))/15 + 2, 1):
        index1 = index + '&page=%d' % (pagefoot)
        indexs.append(index1)

for index2 in indexs:
    soup1 = BeautifulSoup(requests.get(index2).text,'html.parser')
    for item in soup1.select('.mainshow > ul > li'):
        if not item.select('.rt') == []:
            # keep only the articles published in this quarter (string compare on YYYY-MM-DD)
            if item.select('.rt')[0].text < '2018-07-01' and item.select('.rt')[0].text > '2018-03-31':
                for url in item.select('.s3'):
                    urls.append(url.get('href'))

def copy_the_article():
    # fetch one article and append its title, paragraphs and images to the document
    global url
    global file
    b = requests.get(url)
    soup = BeautifulSoup(b.text,'html.parser')
    for news in soup.select('#Article'):
        header = news.select('h1')[0].text
        file.add_paragraph(header)
        for p in news.select('.content > p'):
            content = p.text
            file.add_paragraph(content)
            img_link = []
            for img in p.select('img'):
                img_link.append(img.get('src'))
            if img_link != []:
                for img_url in img_link:
                    if img_url[-4:] == '.gif':
                        break                      # skip .gif images
                    if img_url[:4] == 'http':      # absolute image URL
                        urllib.urlretrieve(img_url, filename=img_url[-21:])
                        file.add_picture(img_url[-21:], width=Inches(3))
                    else:                          # relative URL: prepend the intranet host
                        urllib.urlretrieve('http://192.168.9.240:9191' + img_url, filename=img_url[-20:])
                        file.add_picture(img_url[-20:], width=Inches(3))

print "the program starting...."
file = Document()
for url in urls:
    copy_the_article()
file.save("testt.docx")
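Tables are the one piece I have not automated yet. As a rough sketch of how the script above could be extended, and assuming the intranet pages mark tables up as ordinary HTML <table> elements inside .content (an assumption on my part), python-docx's add_table could carry them over. copy_tables below is a hypothetical helper, not part of the working script:

def copy_tables(news, file):
    # Sketch only: copy each HTML <table> in the article body into the .docx.
    # `news` is the BeautifulSoup tag for #Article and `file` is the python-docx
    # Document, as in the script above. Merged cells and in-table styling are ignored.
    for html_table in news.select('.content table'):
        rows = html_table.find_all('tr')
        if not rows:
            continue
        n_cols = len(rows[0].find_all(['td', 'th']))
        doc_table = file.add_table(rows=len(rows), cols=n_cols)
        doc_table.style = 'Table Grid'             # simple visible borders
        for r, tr in enumerate(rows):
            for c, cell in enumerate(tr.find_all(['td', 'th'])[:n_cols]):
                doc_table.cell(r, c).text = cell.get_text(strip=True)

It could be called from copy_the_article right after the paragraph loop; since I have not run it against the real pages, treat it as a starting point rather than a finished solution.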
Here is the earlier version, which saved the articles to a txt file instead.
#-*- coding:utf-8 -*-#
# Scrape the intranet articles and download the images they contain.
# Getting the right selector for BeautifulSoup's select() is the tricky part:
# in the browser developer tools (F12), right-click the element --copy --copy selector,
# e.g. '#pages > a' or '.mainshow > ul > li' (a small standalone example follows this script).
# Everything is written to article.txt:
##    f = open('article.txt','a+')
##    f.write(header.encode('utf-8'))
##    f.write("\n")
##    f.close()
import urllib
import requests
from bs4 import BeautifulSoup
import re

urls = []
indexs = []
for catid in [71,72,74,75,76]:
    # list page for this category
    index = 'http://192.168.9.240:9191/index.php?m=content&c=index&a=lists&catid=%d' % (catid)
    page = requests.get(index)                     # fetch the list page
    soup = BeautifulSoup(page.text,'html.parser')
    # append &page=2 ... for the following list pages
    getNum = re.compile(r"\d+")                    # regex for the article count
    pageNum = getNum.match(soup.select('#pages > a')[0].text)
##    print pageNum.group(0)                       # the number the regex matched, i.e. the total article count shown on the page
    for pagefoot in range(1, int(pageNum.group(0))/15 + 2, 1):
        index1 = index + '&page=%d' % (pagefoot)
##        print index1
        indexs.append(index1)

for index2 in indexs:
    soup1 = BeautifulSoup(requests.get(index2).text,'html.parser')
    for item in soup1.select('.mainshow > ul > li'):
        if not item.select('.rt') == []:
            # change the publication-date window here to select which articles are scraped
            if item.select('.rt')[0].text < '2018-07-01' and item.select('.rt')[0].text > '2018-03-31':
                for url in item.select('.s3'):
                    urls.append(url.get('href'))
##                    print url.get('href')

# grab the article title and body text (and download the images)
def copy_the_article():
    '''
    copy the article
    '''
    global url
    b = requests.get(url)
    soup = BeautifulSoup(b.text,'html.parser')
##    print soup.prettify()
    for news in soup.select('#Article'):
        header = news.select('h1')[0].text
##        content = news.select('.content')[0].text
        f = open('article.txt','a+')
        f.write(header.encode('utf-8'))
        f.write("\n")
        f.close()
        ## Text and images are handled in the same "for p in news.select(...)" loop
        ## over the <p> tags, so they are processed together and each image link
        ## is written out at its original position in the article.
        for p in news.select('.content > p'):
            content = p.text
            f = open('article.txt','a+')
            f.write(content.encode('utf-8'))
            f.write("\n")
            f.close()
            img_link = []
            for img in p.select('img'):
                img_link.append(img.get('src'))
            if img_link != []:
                for img_url in img_link:
                    if img_url[-4:] == '.gif':
                        break                      # skip .gif images
                    else:
                        f = open('article.txt','a+')
                        f.write(img_url.encode('utf-8'))
                        f.write("\n")
                        f.close()
##                    print img_url[:4]
##                    print len(img_url)
                    if img_url[:4] == 'http':      # absolute image URL
                        urllib.urlretrieve(img_url, filename=img_url[-21:])
                    else:                          # relative URL: prepend the intranet host
                        urllib.urlretrieve('http://192.168.9.240:9191' + img_url, filename=img_url[-20:])

# the main part
print "the program starting...."
for url in urls:
    copy_the_article()
##for url in urls:
##    print url
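The "copy selector" trick mentioned in the header comments is easiest to see on a tiny standalone snippet. A minimal sketch; the HTML below is invented for illustration and only mimics the structure of the intranet list pages:

#-*- coding:utf-8 -*-#
# Minimal illustration of BeautifulSoup's select() with a selector copied
# from the browser developer tools. The HTML here is made up for the example.
from bs4 import BeautifulSoup

html = '''
<div class="mainshow">
  <ul>
    <li><a class="s3" href="/article-1.html">First article</a><span class="rt">2018-04-02</span></li>
    <li><a class="s3" href="/article-2.html">Second article</a><span class="rt">2018-05-10</span></li>
  </ul>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
# '.mainshow > ul > li' picks out every list item, the same way the scripts above do
for li in soup.select('.mainshow > ul > li'):
    print li.select('.s3')[0].get('href'), li.select('.rt')[0].text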