带图片文章下载至word,不带图片则存至mysql

本文供自己参考学习。大体逻辑是利用python及xpath抓取网页内容(带简单的翻页抓取):无图片且内容简短的文章直接存入mysql;带图片或内容过长的文章只在mysql中存入文件名称,正文保存为word文档放到路径1,图片保存到路径2,再利用word宏搜索图片在文章中的具体位置,粘贴图片并删除图片的链接文字。

# -*- coding: utf-8 -*-

import re
import os
import pymysql
import requests
import shutil
from lxml import etree
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import win32com
from win32com.client import constants
# Start every run from a fresh, empty output folder and make it the working
# directory, so files written with relative names end up inside it.
OUTPUT_DIR = "A"
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)  # throw away whatever a previous run left behind
os.mkdir(OUTPUT_DIR)
os.chdir(OUTPUT_DIR)
# --- Crawl configuration -----------------------------------------------------
# Listing-page numbers to visit; each one is appended to `url` as "_<n>.htm".
page_num = [2]
# Listing URL without the "_<n>.htm" suffix.
url = 'http://*********.cn/node_4374'
# Site root (not referenced anywhere in the visible part of the script).
urls = "http://*********.cn/"
# hrefs of the article links on a listing page.
xpath_url2 = "//div[@class='xlayer02 yh ohd clear']/span[@class='fs1']//a/@href"
# Text nodes of the article title element.
xpath_title = "//*[@id='layer213' or @id='title']//text()"
# Text nodes of the element the script parses the date and the "来源:" value from.
xpath_DeclareDate = "//*[@class='layer31' or @class='xt2 yh fl' or @class='layer31 xt2']//text()"
# src attributes found under <center> inside the article body container.
xpath_img = "//*[@id='imgContent' or @id='layer216']/center//@src"
# Article body: every text node plus the same image src values, in one result list.
word1_xpath = "//*[@id='imgContent' or @id='layer216']//text()|//*[@id='imgContent' or @id='layer216']/center//@src"
# Open the MySQL connection. Keyword arguments are used because PyMySQL >= 1.0
# made connect() keyword-only; these keyword names are accepted by older
# releases as well.
db = pymysql.connect(host='10.10.18.46', user='root', password='',
                     database='test', charset='utf8')
cursor = db.cursor()  # cursor used for the schema statements below
# Every run starts from an empty table: any existing `news` table is dropped.
cursor.execute("DROP TABLE IF EXISTS news")
sql = """CREATE TABLE news (NewsID NVARCHAR(20),DeclareDate NVARCHAR(10) not null,ClassifyID
         NVARCHAR(12),Classify NVARCHAR(60),Title NVARCHAR(200),Autor NVARCHAR(100),NewsSource
          NVARCHAR(200),Industry NVARCHAR(100),Content TEXT,AccessoryName NVARCHAR(100))"""
cursor.execute(sql)  # create the table the crawl loop inserts into
# ---------------------------------------------------------------------------
# Crawl loop.
#
# For every listing page in `page_num`, visit each linked article and store it:
#   * no images and text shorter than INLINE_CONTENT_LIMIT -> text goes into
#     news.Content;
#   * otherwise the text is saved as "<sanitised title>.docx" in the current
#     working directory and only that name goes into news.AccessoryName.
#     When the article has images they are downloaded to IMAGE_DIR and the
#     Word macro "Macro1" is run on the document to paste them in.
#
# Relies on names defined earlier in the script: page_num, url, the xpath_*
# strings, word1_xpath and the open pymysql connection `db`.
# ---------------------------------------------------------------------------
SITE_ROOT = "http://*********.cn/"
# Must be the folder hard-coded in the Word macro "Macro1".
IMAGE_DIR = r"D:\Python\xvexi\A图片"
MAX_ARTICLE_PAGES = 5        # same upper bound as the original nested pager code
INLINE_CONTENT_LIMIT = 2000  # texts at least this long are stored as .docx
REQUEST_TIMEOUT = 30         # seconds; keeps one dead server from hanging the run
PAGER_MARKERS = ("下一页", "末页")
TEXT_SEPARATOR = "\n  "

INSERT_INLINE_SQL = (
    "INSERT INTO news(DeclareDate, Title, NewsSource, Industry, Content) "
    "VALUES (%s, %s, %s, %s, %s)"
)
INSERT_ATTACHMENT_SQL = (
    "INSERT INTO news(DeclareDate, Title, NewsSource, Industry, AccessoryName) "
    "VALUES (%s, %s, %s, %s, %s)"
)

# Characters replaced by "_" when a title becomes a file name. The first list
# is the original one; the second adds the remaining characters Windows
# forbids in file names.
_TITLE_CHARS_TO_REPLACE = [
    "/", "\"", "'", "·", "。", "?", "!", ",", "、", ";", ":", "‘", "’",
    "“", "”", "(", ")", "…", "–", ".", "《", "》",
] + ["\\", "*", "<", ">", "|"]
_TITLE_TRANSLATION = str.maketrans({ch: "_" for ch in _TITLE_CHARS_TO_REPLACE})


def _fetch_tree(page_url):
    """Download *page_url* and return it parsed as an lxml HTML tree."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    return etree.HTML(response.content)


def _page_url(article_url, page_no):
    """Return the URL of page *page_no* of an article.

    The suffix "_<page_no>" is inserted in front of the last "." of the
    article URL, e.g. ".../content_1.htm" -> ".../content_1_2.htm".
    """
    dot = article_url.rfind(".")
    return article_url[:dot] + "_" + str(page_no) + article_url[dot:]


def _has_next_page(page_text):
    """Return True when the page text contains every pager marker."""
    return all(marker in page_text for marker in PAGER_MARKERS)


def _collect_article_body(first_tree, article_url):
    """Return ``(text, image_srcs)`` for an article, following its pager.

    Follow-up pages are requested while the page just read shows the pager
    markers, up to MAX_ARTICLE_PAGES pages in total.
    """
    text_parts = []
    image_srcs = []
    tree = first_tree
    page_no = 1
    while tree is not None:
        page_text = TEXT_SEPARATOR.join(tree.xpath(word1_xpath)).strip()
        text_parts.append(page_text)
        image_srcs.extend(tree.xpath(xpath_img))
        if page_no >= MAX_ARTICLE_PAGES or not _has_next_page(page_text):
            break
        page_no += 1
        tree = _fetch_tree(_page_url(article_url, page_no))
    return TEXT_SEPARATOR.join(text_parts), image_srcs


def _save_docx(doc_title, text):
    """Write a .docx (centred heading + text) to the cwd; return its file name."""
    document = Document()
    heading = document.add_heading(doc_title, 0)  # level 0 = document title
    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    document.add_paragraph(text)
    filename = "%s.docx" % doc_title
    # python-docx documents have no Close(); save() is all that is required.
    document.save(filename)
    return filename


def _download_images(image_srcs):
    """Empty IMAGE_DIR and download every image of the current article into it.

    Absolute paths are used instead of os.chdir so that a failed download
    cannot leave the process in the wrong working directory.
    """
    if os.path.exists(IMAGE_DIR):
        shutil.rmtree(IMAGE_DIR)
    os.mkdir(IMAGE_DIR)
    for src in image_srcs:
        image_url = src.replace("../../..", "http://*********.cn")
        image_name = image_url[image_url.rfind("/") + 1:]
        response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        with open(os.path.join(IMAGE_DIR, image_name), 'wb') as image_file:
            image_file.write(response.content)
        print(image_name)


def _paste_images_with_word(docx_path):
    """Open *docx_path* in Word, run "Macro1" on it and save (best effort).

    NOTE(review): assumes "Macro1" is available to the Word instance
    (e.g. stored in the Normal template) - confirm on the target machine.
    """
    word_app = None
    try:
        word_app = win32com.client.Dispatch('Word.Application')
        word_app.Visible = 0
        word_app.DisplayAlerts = 0
        word_app.Documents.Open(docx_path)
        word_app.Run('Macro1')
        word_app.Documents.Save()
    except Exception as exc:
        print("粘贴图片失败!", exc)
    finally:
        # Always shut Word down, otherwise a failed macro leaves a hidden
        # WINWORD process behind for every article.
        if word_app is not None:
            try:
                word_app.Quit()
            except Exception as exc:
                print("粘贴图片失败!", exc)


def _insert_row(insert_sql, params):
    """Run one parameterised INSERT on `db` and commit it."""
    with db.cursor() as cur:
        # The driver does the quoting, so scraped text cannot break or
        # inject into the statement and needs no manual escaping.
        cur.execute(insert_sql, params)
    db.commit()


def _process_article(article_url):
    """Scrape one article and store it in MySQL and, if needed, as .docx.

    Raises:
        ValueError: the page shows no date or no "来源:" entry.
    """
    tree = _fetch_tree(article_url)
    title = ''.join(tree.xpath(xpath_title)).strip()
    meta_text = ''.join(tree.xpath(xpath_DeclareDate)).strip()

    date_match = re.search(r'20\d{2}[-]\d{1,2}[-]\d{1,2}', meta_text)
    source_match = re.search(r'来源:\S*', meta_text)
    if date_match is None or source_match is None:
        raise ValueError("date or source not found on " + article_url)
    declare_date = date_match.group()
    news_source = re.sub(r"来源:", "", source_match.group()).strip()

    text, image_srcs = _collect_article_body(tree, article_url)

    if not image_srcs and len(text) < INLINE_CONTENT_LIMIT:
        _insert_row(INSERT_INLINE_SQL,
                    (declare_date, title, news_source, article_url, text))
        return

    # Long text and/or images: keep the body in a Word file instead.
    doc_title = title.translate(_TITLE_TRANSLATION)
    if image_srcs:
        print(doc_title)
    filename = _save_docx(doc_title, text)
    if image_srcs:
        _download_images(image_srcs)
        _paste_images_with_word(os.path.abspath(filename))
    _insert_row(INSERT_ATTACHMENT_SQL,
                (declare_date, title, news_source, article_url, doc_title))


try:
    for z in page_num:
        listing_url = url + "_" + str(z) + ".htm"
        try:
            article_paths = _fetch_tree(listing_url).xpath(xpath_url2)
        except Exception as exc:
            print("未获取到url!", exc)
            continue
        for article_path in article_paths:
            try:
                _process_article(SITE_ROOT + article_path)
            except Exception as exc:
                # One broken article must not stop the crawl; the detail is
                # printed so real bugs are not mistaken for missing pages.
                print("网页不存在!", exc)
finally:
    db.close()  # close the connection even if the crawl is interrupted

' Macro1
' For every file in the image folder: look up the file name in the document,
' remove the link text that contains it, and insert the picture at that spot.
' Files whose name does not occur in the document are skipped.
Sub Macro1()
    Const PICTURE_DIR As String = "D:\Python\xvexi\A图片\"
    Dim picName As String

    picName = Dir(PICTURE_DIR)
    Do While picName <> ""
        ' Dir does not return files in document order, so every search
        ' starts again from the top of the document.
        Selection.HomeKey Unit:=wdStory
        With Selection.Find
            .ClearFormatting
            .Text = picName
            .Forward = True
            .Wrap = wdFindStop
            .MatchWildcards = False
        End With

        If Selection.Find.Execute Then
            ' The selection now covers the file name. Grow it backwards to
            ' the whitespace / line break in front of the link so the whole
            ' link text (e.g. "../../../dir/name.jpg") is selected.
            Selection.MoveStartUntil Cset:=" " & vbTab & vbCr & Chr(11), Count:=wdBackward
            Selection.Delete
            Selection.InlineShapes.AddPicture FileName:=PICTURE_DIR & picName, LinkToFile:=False, SaveWithDocument:=True
        End If

        picName = Dir
    Loop
End Sub

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值