本文供自己参考学习。大体逻辑:利用 Python 和 XPath 抓取网页内容(带简单的翻页抓取)。不含图片且内容简短的文章直接存入 MySQL;带图片或内容过长的文章只在 MySQL 中存入文件名称,正文保存为 Word 文档放到路径 1,图片保存到路径 2;再利用 Word 宏在文档中查找每张图片的具体位置并粘贴图片,最后删除图片的链接文字。
# -*- coding: utf-8 -*-
import os
import re
import shutil

import pymysql
import requests
import win32com
import win32com.client
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from lxml import etree
from win32com.client import constants
# ---------------------------------------------------------------------------
# News scraper: walks the listing pages, downloads every linked article and
# stores it either directly in MySQL (short, image-free articles) or as a
# Word document whose name is recorded in MySQL (long articles and articles
# with images).  For articles with images the pictures are downloaded and
# the Word macro "Macro1" is run to paste them into the document.
# ---------------------------------------------------------------------------

# --- Configuration ---------------------------------------------------------
page_num = [2]  # listing page numbers to crawl
url = 'http://*********.cn/node_4374'  # listing URL without the "_<n>.htm" suffix
urls = "http://*********.cn/"  # site root; article hrefs are appended to it
xpath_url2 = "//div[@class='xlayer02 yh ohd clear']/span[@class='fs1']//a/@href"
xpath_title = "//*[@id='layer213' or @id='title']//text()"
xpath_DeclareDate = "//*[@class='layer31' or @class='xt2 yh fl' or @class='layer31 xt2']//text()"
xpath_img = "//*[@id='imgContent' or @id='layer216']/center//@src"
# Body text AND image src values, in document order: the src strings are kept
# in the text on purpose so that the Word macro can locate each image's spot.
word1_xpath = "//*[@id='imgContent' or @id='layer216']//text()|//*[@id='imgContent' or @id='layer216']/center//@src"

base_dir = r"D:\Python\xvexi"  # raw string: "\P" / "\x" must not be read as escapes
doc_dir = os.path.join(base_dir, "A")  # generated .docx files
img_dir = os.path.join(base_dir, "A图片")  # images of the article being processed
content_limit = 2000  # articles shorter than this (and image-free) go into MySQL
max_pages = 5  # an article is followed across at most this many pages
request_timeout = 30  # seconds; without it a stalled server hangs the crawl

# Characters replaced by "_" when a title is used as a file name.  The first
# part is the original punctuation list; the second adds the remaining
# characters Windows forbids in file names.
unsafe_title_chars = "/\"'·。?!,、;:‘’“”()…–.《》" + "\\:*?<>|"
_title_table = str.maketrans(unsafe_title_chars, "_" * len(unsafe_title_chars))

create_table_sql = """CREATE TABLE news (NewsID NVARCHAR(20),DeclareDate NVARCHAR(10) not null,ClassifyID
NVARCHAR(12),Classify NVARCHAR(60),Title NVARCHAR(200),Autor NVARCHAR(100),NewsSource
NVARCHAR(200),Industry NVARCHAR(100),Content TEXT,AccessoryName NVARCHAR(100))"""


# --- Pure helpers ----------------------------------------------------------
def sanitize_title(title):
    """Return *title* with every file-name-unsafe character replaced by "_"."""
    return title.translate(_title_table)


def has_next_page(text):
    """Return True when *text* contains both pager labels ("下一页" and "末页").

    The original test ``("下一页" and "末页") in text`` only ever looked for
    "末页", because ``and`` between two non-empty strings yields the second.
    """
    return "下一页" in text and "末页" in text


def paged_url(article_url, page):
    """Return the URL of page number *page* of a multi-page article.

    The page suffix ("_2", "_3", ...) is inserted before the file extension
    of the last path segment.  Only that segment is inspected, so the dots in
    the host name are never mistaken for the extension separator.
    """
    head, slash, tail = article_url.rpartition("/")
    stem, dot, extension = tail.rpartition(".")
    if not dot:
        return article_url + "_" + str(page)
    return head + slash + stem + "_" + str(page) + dot + extension


def parse_meta(meta_text):
    """Extract ``(declare_date, news_source)`` from the article's meta line.

    Raises:
        ValueError: if no date is present (DeclareDate is NOT NULL in the table).

    A missing source is not an error; an empty string is returned for it.
    """
    date_match = re.search(r'20\d{2}[-]\d{1,2}[-]\d{1,2}', meta_text)
    if date_match is None:
        raise ValueError("no publication date found in %r" % meta_text)
    source_match = re.search(r'来源:\S*', meta_text)
    source = ""
    if source_match is not None:
        source = re.sub(r"来源:", "", source_match.group()).strip()
    return date_match.group(), source


def resolve_image(src):
    """Return ``(absolute_url, file_name)`` for an image ``src`` attribute."""
    absolute = src.replace("../../..", urls.rstrip("/"))
    return absolute, absolute[absolute.rfind("/") + 1:]


# --- I/O helpers -----------------------------------------------------------
def fetch_html(page_url):
    """Download *page_url* and return it as a parsed lxml tree.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
        ValueError: if the response body cannot be parsed into a document.
    """
    response = requests.get(page_url, timeout=request_timeout)
    response.raise_for_status()
    tree = etree.HTML(response.content)
    if tree is None:
        raise ValueError("empty document: %s" % page_url)
    return tree


def collect_article_body(first_page, article_url):
    """Return ``(text, image_srcs)`` for an article, following its pager.

    Follow-up pages are derived from *article_url* (not from the listing URL)
    and each page number is fetched exactly once.  Pagination stops as soon
    as a page no longer shows the pager labels or cannot be fetched.
    """
    parts = ['\n '.join(first_page.xpath(word1_xpath)).strip()]
    images = list(first_page.xpath(xpath_img))
    for page in range(2, max_pages + 1):
        if not has_next_page(parts[-1]):
            break
        try:
            tree = fetch_html(paged_url(article_url, page))
        except (requests.RequestException, ValueError):
            break
        parts.append('\n '.join(tree.xpath(word1_xpath)).strip())
        images.extend(tree.xpath(xpath_img))
    return '\n '.join(parts), images


def save_docx(safe_title, text):
    """Write *text* under a centred heading to ``<safe_title>.docx``.

    Returns the file name (without directory).  python-docx documents have no
    ``Close`` method; ``save`` is all that is required.
    """
    document = Document()
    heading = document.add_heading(safe_title, 0)  # 0 = title-level heading
    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    document.add_paragraph(text)
    filename = "%s.docx" % safe_title
    document.save(os.path.join(doc_dir, filename))
    return filename


def download_images(image_srcs):
    """Empty the image directory, then download every image of the article.

    The directory is reset per article because the Word macro pastes every
    file it finds there into the currently opened document.
    """
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    os.makedirs(img_dir)
    for src in image_srcs:
        image_url, name = resolve_image(src)
        if not name:
            continue  # src ends with "/": nothing usable as a file name
        try:
            response = requests.get(image_url, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("图片下载失败!", image_url, exc)
            continue
        with open(os.path.join(img_dir, name), 'wb') as image_file:
            image_file.write(response.content)
        print(name)


def paste_images(filename):
    """Open the document in Word and run ``Macro1`` to paste the images.

    Best effort: any failure is reported and the crawl continues.  Word is
    always asked to quit so that no hidden WINWORD process is left behind.
    """
    try:
        word_app = win32com.client.Dispatch('Word.Application')
        try:
            word_app.Visible = 0
            word_app.DisplayAlerts = 0
            word_app.Documents.Open(os.path.join(doc_dir, filename))
            word_app.Run('Macro1')
            word_app.Documents.Save()
        finally:
            word_app.Quit()
    except Exception as exc:  # COM errors come in several unrelated types
        print("粘贴图片失败!", exc)


def store_record(db, declare_date, title, source, link, content=None, accessory=None):
    """Insert one article row; exactly one of *content* / *accessory* is set.

    Values are passed as query parameters so that quotes in scraped text can
    neither break nor inject into the statement.  The article URL is stored
    in the ``Industry`` column, as in the original script.
    """
    insert_sql = ("INSERT INTO news(DeclareDate, Title, NewsSource, Industry, Content, AccessoryName) "
                  "VALUES (%s, %s, %s, %s, %s, %s)")
    with db.cursor() as cursor:
        cursor.execute(insert_sql, (declare_date, title, source, link, content, accessory))
    db.commit()


def reset_news_table(db):
    """Drop and re-create the ``news`` table."""
    with db.cursor() as cursor:
        cursor.execute("DROP TABLE IF EXISTS news")
        cursor.execute(create_table_sql)


def prepare_output_dir():
    """Start every run with an empty document directory."""
    if os.path.exists(doc_dir):
        shutil.rmtree(doc_dir)
    os.makedirs(doc_dir)


def process_article(db, article_url):
    """Scrape one article and store it in MySQL and/or as a Word document."""
    tree = fetch_html(article_url)
    title = ''.join(tree.xpath(xpath_title)).strip()
    meta_text = ''.join(tree.xpath(xpath_DeclareDate)).strip()
    declare_date, source = parse_meta(meta_text)
    text, images = collect_article_body(tree, article_url)

    if not images and len(text) < content_limit:
        store_record(db, declare_date, title, source, article_url, content=text)
        return

    # Long article (2000 characters or more) or article with images:
    # the text goes into a Word document and only its name into MySQL.
    safe_title = sanitize_title(title)
    filename = save_docx(safe_title, text)
    if images:
        print(safe_title)
        download_images(images)
        paste_images(filename)
    store_record(db, declare_date, title, source, article_url, accessory=safe_title)


def main():
    """Crawl every configured listing page and store all linked articles."""
    prepare_output_dir()
    # NOTE(review): host and the empty root password are hard-coded in the
    # original script; keyword arguments are required by PyMySQL 1.x.
    db = pymysql.connect(host='10.10.18.46', user='root', password='',
                         database='test', charset='utf8')
    try:
        reset_news_table(db)
        for z in page_num:
            listing_url = url + "_" + str(z) + ".htm"
            try:
                article_hrefs = fetch_html(listing_url).xpath(xpath_url2)
            except (requests.RequestException, ValueError) as exc:
                print("未获取到url!", listing_url, exc)
                continue
            for href in article_hrefs:
                article_url = urls + href
                try:
                    process_article(db, article_url)
                except Exception as exc:  # one bad article must not stop the crawl
                    print("网页不存在!", article_url, exc)
    finally:
        db.close()


if __name__ == "__main__":
    main()
' Macro1: paste every image found in IMAGE_DIR into the active document.
'
' For each image file the macro looks for the file name in the document. The
' scraper leaves each image's link text (e.g. "../../../path/name.jpg") at the
' spot where the image belongs. When the name is found, the link text is
' removed and the picture is inserted in its place. Files whose name does not
' occur in the document are skipped.
'
' Each link is removed inside the loop, right where its image is inserted,
' instead of by one wildcard replace after the loop: that replace only knew
' the last file name, and its "*" could match across unrelated text.
Sub Macro1()
    Const IMAGE_DIR As String = "D:\Python\xvexi\A图片\"
    Const LINK_PREFIX As String = "../../../"

    Dim imageName As String   ' file currently being processed
    Dim target As Range       ' where the file name / link text was found
    Dim prefix As Range       ' start of the link text, searched backwards
    Dim linkText As String

    imageName = Dir(IMAGE_DIR)
    Do While imageName <> ""
        ' Always search the whole document, independent of the cursor position
        ' and of Find options left over from a previous search.
        Set target = ActiveDocument.Content
        With target.Find
            .ClearFormatting
            .Text = imageName
            .Forward = True
            .Wrap = wdFindStop
            .MatchWildcards = False
        End With

        If target.Find.Execute Then
            ' Look backwards from the file name for the beginning of its link.
            Set prefix = ActiveDocument.Range(0, target.Start)
            With prefix.Find
                .ClearFormatting
                .Text = LINK_PREFIX
                .Forward = False
                .Wrap = wdFindStop
                .MatchWildcards = False
            End With

            linkText = ""
            If prefix.Find.Execute Then
                linkText = ActiveDocument.Range(prefix.Start, target.End).Text
            End If

            ' Accept the candidate only if it is one unbroken token; otherwise
            ' the prefix belongs to a different link and deleting up to it
            ' would remove article text.
            If linkText <> "" _
               And InStr(linkText, " ") = 0 _
               And InStr(linkText, vbCr) = 0 _
               And InStr(linkText, vbVerticalTab) = 0 Then
                target.Start = prefix.Start
                target.Delete
            Else
                ' No recognisable link: keep the text, insert after the name.
                target.Collapse Direction:=wdCollapseEnd
            End If

            ActiveDocument.InlineShapes.AddPicture _
                FileName:=IMAGE_DIR & imageName, _
                LinkToFile:=False, _
                SaveWithDocument:=True, _
                Range:=target
        End If

        imageName = Dir
    Loop
End Sub