The company requires a quarterly archive of the intranet: every article on the internal site has to be copied into a Word document. This chore has worn me down ever since I joined in March 2015. At first I copied the articles one by one, adjusting font sizes and paragraph spacing with painstaking care, which took several evenings. Later I got a bit craftier and copied everything first, then fixed the formatting in one pass, but that still ate up most of a day. Recently I finally managed to do it with Python, saving everything straight to a .docx file automatically; the only thing it cannot handle yet is tables.
The code below is only sparsely commented.
#-*- coding:utf-8 -*-#
# Collect the article URLs from the intranet CMS, then copy each article's
# title, body text and images into one .docx file.
import urllib
import requests
from bs4 import BeautifulSoup
import re
from docx import Document
from docx.shared import Inches

urls = []
indexs = []
for catid in [71,72,74,75,76]:
    # list page for this category
    index = 'http://192.168.9.240:9191/index.php?m=content&c=index&a=lists&catid=%d' % (catid)
    page = requests.get(index)                     # fetch the list page
    soup = BeautifulSoup(page.text,'html.parser')
    getNum = re.compile(r"\d+")                    # regex for the article count
    pageNum = getNum.match(soup.select('#pages > a')[0].text)
    # 15 articles per list page, so derive the page count from the total number of articles
    for pagefoot in range(1, int(pageNum.group(0))/15 + 2, 1):
        index1 = index + '&page=%d' % (pagefoot)
        indexs.append(index1)

for index2 in indexs:
    soup1 = BeautifulSoup(requests.get(index2).text,'html.parser')
    for item in soup1.select('.mainshow > ul > li'):
        if not item.select('.rt') == []:
            # keep only the articles published in this quarter (string compare on YYYY-MM-DD)
            if item.select('.rt')[0].text < '2018-07-01' and item.select('.rt')[0].text > '2018-03-31':
                for url in item.select('.s3'):
                    urls.append(url.get('href'))

def copy_the_article():
    # fetch one article and append its title, paragraphs and images to the document
    global url
    global file
    b = requests.get(url)
    soup = BeautifulSoup(b.text,'html.parser')
    for news in soup.select('#Article'):
        header = news.select('h1')[0].text
        file.add_paragraph(header)
        for p in news.select('.content > p'):
            content = p.text
            file.add_paragraph(content)
            img_link = []
            for img in p.select('img'):
                img_link.append(img.get('src'))
            if img_link != []:
                for img_url in img_link:
                    if img_url[-4:] == '.gif':
                        break                      # skip .gif images
                    if img_url[:4] == 'http':      # absolute image URL
                        urllib.urlretrieve(img_url, filename=img_url[-21:])
                        file.add_picture(img_url[-21:], width=Inches(3))
                    else:                          # relative URL: prepend the intranet host
                        urllib.urlretrieve('http://192.168.9.240:9191' + img_url, filename=img_url[-20:])
                        file.add_picture(img_url[-20:], width=Inches(3))

print "the program starting...."
file = Document()
for url in urls:
    copy_the_article()
file.save("testt.docx")
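Tables are the one piece I have not automated yet. As a rough sketch of how the script above could be extended, and assuming the intranet pages mark tables up as ordinary HTML <table> elements inside .content (an assumption on my part), python-docx's add_table could carry them over. copy_tables below is a hypothetical helper, not part of the working script:

def copy_tables(news, file):
    # Sketch only: copy each HTML <table> in the article body into the .docx.
    # `news` is the BeautifulSoup tag for #Article and `file` is the python-docx
    # Document, as in the script above. Merged cells and in-table styling are ignored.
    for html_table in news.select('.content table'):
        rows = html_table.find_all('tr')
        if not rows:
            continue
        n_cols = len(rows[0].find_all(['td', 'th']))
        doc_table = file.add_table(rows=len(rows), cols=n_cols)
        doc_table.style = 'Table Grid'             # simple visible borders
        for r, tr in enumerate(rows):
            for c, cell in enumerate(tr.find_all(['td', 'th'])[:n_cols]):
                doc_table.cell(r, c).text = cell.get_text(strip=True)

It could be called from copy_the_article right after the paragraph loop; since I have not run it against the real pages, treat it as a starting point rather than a finished solution.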
Here is the earlier version, which saved the articles to a txt file instead.
#-*- coding:utf-8 -*-#
# Scrape the intranet articles and download the images they contain.
# Getting the right selector for BeautifulSoup's select() is the tricky part:
# in the browser developer tools (F12), right-click the element --copy --copy selector,
# e.g. '#pages > a' or '.mainshow > ul > li' (a small standalone example follows this script).
# Everything is written to article.txt:
##    f = open('article.txt','a+')
##    f.write(header.encode('utf-8'))
##    f.write("\n")
##    f.close()
import urllib
import requests
from bs4 import BeautifulSoup
import re

urls = []
indexs = []
for catid in [71,72,74,75,76]:
    # list page for this category
    index = 'http://192.168.9.240:9191/index.php?m=content&c=index&a=lists&catid=%d' % (catid)
    page = requests.get(index)                     # fetch the list page
    soup = BeautifulSoup(page.text,'html.parser')
    # append &page=2 ... for the following list pages
    getNum = re.compile(r"\d+")                    # regex for the article count
    pageNum = getNum.match(soup.select('#pages > a')[0].text)
##    print pageNum.group(0)                       # the number the regex matched, i.e. the total article count shown on the page
    for pagefoot in range(1, int(pageNum.group(0))/15 + 2, 1):
        index1 = index + '&page=%d' % (pagefoot)
##        print index1
        indexs.append(index1)

for index2 in indexs:
    soup1 = BeautifulSoup(requests.get(index2).text,'html.parser')
    for item in soup1.select('.mainshow > ul > li'):
        if not item.select('.rt') == []:
            # change the publication-date window here to select which articles are scraped
            if item.select('.rt')[0].text < '2018-07-01' and item.select('.rt')[0].text > '2018-03-31':
                for url in item.select('.s3'):
                    urls.append(url.get('href'))
##                    print url.get('href')

# grab the article title and body text (and download the images)
def copy_the_article():
    '''
    copy the article
    '''
    global url
    b = requests.get(url)
    soup = BeautifulSoup(b.text,'html.parser')
##    print soup.prettify()
    for news in soup.select('#Article'):
        header = news.select('h1')[0].text
##        content = news.select('.content')[0].text
        f = open('article.txt','a+')
        f.write(header.encode('utf-8'))
        f.write("\n")
        f.close()
        ## Text and images are handled in the same "for p in news.select(...)" loop
        ## over the <p> tags, so they are processed together and each image link
        ## is written out at its original position in the article.
        for p in news.select('.content > p'):
            content = p.text
            f = open('article.txt','a+')
            f.write(content.encode('utf-8'))
            f.write("\n")
            f.close()
            img_link = []
            for img in p.select('img'):
                img_link.append(img.get('src'))
            if img_link != []:
                for img_url in img_link:
                    if img_url[-4:] == '.gif':
                        break                      # skip .gif images
                    else:
                        f = open('article.txt','a+')
                        f.write(img_url.encode('utf-8'))
                        f.write("\n")
                        f.close()
##                    print img_url[:4]
##                    print len(img_url)
                    if img_url[:4] == 'http':      # absolute image URL
                        urllib.urlretrieve(img_url, filename=img_url[-21:])
                    else:                          # relative URL: prepend the intranet host
                        urllib.urlretrieve('http://192.168.9.240:9191' + img_url, filename=img_url[-20:])

# the main part
print "the program starting...."
for url in urls:
    copy_the_article()
##for url in urls:
##    print url
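The "copy selector" trick mentioned in the header comments is easiest to see on a tiny standalone snippet. A minimal sketch; the HTML below is invented for illustration and only mimics the structure of the intranet list pages:

#-*- coding:utf-8 -*-#
# Minimal illustration of BeautifulSoup's select() with a selector copied
# from the browser developer tools. The HTML here is made up for the example.
from bs4 import BeautifulSoup

html = '''
<div class="mainshow">
  <ul>
    <li><a class="s3" href="/article-1.html">First article</a><span class="rt">2018-04-02</span></li>
    <li><a class="s3" href="/article-2.html">Second article</a><span class="rt">2018-05-10</span></li>
  </ul>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
# '.mainshow > ul > li' picks out every list item, the same way the scripts above do
for li in soup.select('.mainshow > ul > li'):
    print li.select('.s3')[0].get('href'), li.select('.rt')[0].text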