Web Crawler

'''Crawler for 高新技术企业认定管理工作网 (www.innocom.gov.cn), 公示公告 (announcements) section'''

import re
import time

import pymysql
import requests
from lxml import etree


class Spider_GXJSQYRDW_Allday:
    def __init__(self, page_total=14):  # the announcement list spans 14 pages in total
        self.page_total = page_total
        self.url = [r'http://www.innocom.gov.cn/gxjsqyrdw/gswj/list.shtml'] + \
                   [r'http://www.innocom.gov.cn/gxjsqyrdw/gswj/list_%d.shtml' % (_ + 2) for _ in range(page_total - 1)]  # URLs of the announcement listing pages, 14 in total
        self.url_homepage = r'http://www.innocom.gov.cn'  # site root, used to build absolute URLs
        self.save_path = 'C:/Users/admin/Desktop/爬虫附件存放/高新技术企业认定管理工作网-公示公告/%s'  # local path template for downloaded images and attachments
        self.date_today = time.strftime('%Y-%m-%d', time.localtime())
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }
        self.website_info = {
            'website_name': '高新技术企业认定管理工作网',
            'block_name': '公示公告'
        }
        self.mysql = {
            'host': 'localhost',
            'user': 'root',
            'pwd': '123456',
            'database': 'test'
        }

    def get_html(self, url):
        '''Fetch a page and return its parsed lxml HTML tree.'''
        r = requests.get(url, headers=self.headers)
        r.encoding = 'utf-8'
        html = etree.HTML(r.text)
        return html

    def get_image(self, url, title):
        '''Download the article's images in order; files are named <title>_1.jpg (or .png/.gif etc.), <title>_2..., and so on.
        :return:
        image_save_path: save path of each image
        save_successOrNot: whether each image downloaded successfully
        '''
        image_save_path = []
        save_successOrNot = []
        for i, j in enumerate(url[:-1]):  # the last <img> is skipped; get_content likewise excludes it from its count
            try:
                r = requests.get(self.url_homepage + j, headers=self.headers)
                image_save_path_temp = self.save_path % (title + '_' + str(i + 1) + j[-4:])
                with open(image_save_path_temp, 'wb') as f:
                    f.write(r.content)
                image_save_path.append(image_save_path_temp)
                save_successOrNot.append('下载成功')
            except Exception:
                image_save_path.append('')
                save_successOrNot.append('下载失败')

        return image_save_path, save_successOrNot

    def get_text_attachment(self, text_url, text_attach_url, text_attach_title):
        '''Download the article's attachments in order.
        :return:
        text_attach_save_path: save path of each attachment
        save_successOrNot: whether each attachment downloaded successfully
        '''
        text_attach_save_path = []
        save_successOrNot = []
        for i, j in enumerate(text_attach_url):
            try:
                # attachment hrefs are relative; rebuild the absolute URL from the article URL (minus '.shtml') plus the '/files...' part of the href
                text_attach_complete_url = text_url.replace('.shtml', '') + re.search(r"/files(.*?)$", j, re.I).group()
                r = requests.get(text_attach_complete_url, headers=self.headers)
                # keep the linked filename if it already carries an extension, otherwise append the href's extension
                if text_attach_title[i].count('.pdf') or text_attach_title[i].count('.PDF') or text_attach_title[i].count('.doc'):
                    title = text_attach_title[i]
                else:
                    title = text_attach_title[i] + j[-4:]
                text_attach_save_path_temp = self.save_path % title
                with open(text_attach_save_path_temp, 'wb') as f:
                    f.write(r.content)
                text_attach_save_path.append(text_attach_save_path_temp)
                save_successOrNot.append('下载成功')
            except Exception:
                text_attach_save_path.append('')
                save_successOrNot.append('下载失败')

        return text_attach_save_path, save_successOrNot

    def get_content(self):
        '''
        :return:
        title: article title
        date: publication date
        text_url: article URL
        text: article body text
        text_attach_exist: whether the article has attachments
        text_attach_save_path: attachment save paths
        text_attach_successOrNot: whether the attachments downloaded successfully
        image_exist: whether the article contains images
        image_save_path: image save paths
        image_save_successOrNot: whether the images downloaded successfully
        '''
        title = []
        date = []
        text_url = []
        text = []
        image_exist = []
        image_save_successOrNot = []
        image_save_path = []
        text_attach_exist = []
        text_attach_successOrNot = []
        text_attach_save_path = []
        for i in range(self.page_total):  # collect titles, dates and article links from every listing page
            html = self.get_html(self.url[i])
            title += html.xpath("//div[@class='listbox']//li/a/text()")
            date += html.xpath("//div[@class='listbox']//li/span/text()")
            text_url += [self.url_homepage + _ for _ in html.xpath("//div[@class='listbox']//li/a/@href")]  # absolute article URLs
        for j, k in enumerate(text_url):  # visit each article and scrape its content
            html = self.get_html(k)
            text_temp = html.xpath("//div[@id='content']//text()")
            # text.append(''.join(text_temp).encode('utf-8'))  # alternative: store the body as UTF-8 bytes
            text.append(''.join(text_temp))  # article body as plain text

            # check for images, download and record them
            image_url = html.xpath("//div[@id='content']//img/@src")
            if image_url == [] or len(image_url) == 1:  # no images, or only the single trailing image that get_image skips
                image_exist.append('无图片')
                image_save_successOrNot.append('')
                image_save_path.append('')
            else:
                img_save_path, img_save_successOrNot = self.get_image(image_url, title[j])
                image_exist.append('有%d张图片' % (len(image_url) - 1))
                image_save_successOrNot.append(' & '.join(img_save_successOrNot))
                image_save_path.append(' & '.join(img_save_path))

            # check for attachments, download and record them
            text_attach_url = html.xpath("//div[@id='content']//a/@href")
            text_attach_title = html.xpath("//div[@id='content']//a/text()")
            if text_attach_url == []:
                text_attach_exist.append('无附件')
                text_attach_successOrNot.append('')
                text_attach_save_path.append('')
            else:
                text_atta_save_path, text_atta_successOrNot = self.get_text_attachment(k, text_attach_url, text_attach_title)
                text_attach_exist.append('有%d个附件' % (len(text_attach_url)))
                text_attach_successOrNot.append(' & '.join(text_atta_successOrNot))
                text_attach_save_path.append(' & '.join(text_atta_save_path))

        return title, date, text_url, text, text_attach_exist, text_attach_save_path, text_attach_successOrNot, image_exist, image_save_path, image_save_successOrNot

    def to_database(self):
        '''Write the scraped records into the MySQL database.'''
        db = pymysql.connect(
            host=self.mysql['host'],
            user=self.mysql['user'],
            password=self.mysql['pwd'],
            database=self.mysql['database'],
            charset='utf8'
        )
        cursor = db.cursor()
        sql = "insert into policy_info(website_name, " \
              "website_block_name, " \
              "text_title, " \
              "text_date," \
              "text_url, " \
              "text_content, " \
              "text_attachment_exist, " \
              "text_attachment_save_path," \
              "get_text_atta_okornot, " \
              "image_exist, " \
              "image_save_path, " \
              "get_image_okornot) " \
              "values (%s, %s, %s, str_to_date(%s, '%%Y-%%m-%%d'), %s, %s, %s, %s, %s, %s, %s, %s);"

        text_info = self.get_content()
        for i in range(len(text_info[0])):  # one row per scraped article; indices follow get_content's return order
            cursor.execute(sql, (self.website_info['website_name'],
                                 self.website_info['block_name'],
                                 text_info[0][i],
                                 text_info[1][i],
                                 text_info[2][i],
                                 text_info[3][i],
                                 text_info[4][i],
                                 text_info[5][i],
                                 text_info[6][i],
                                 text_info[7][i],
                                 text_info[8][i],
                                 text_info[9][i],
                                 ))

        db.commit()
        cursor.close()
        db.close()

        return text_info

def main_GXJSQYRDW_Allday():
    spider = Spider_GXJSQYRDW_Allday()
    spider.to_database()

if __name__ == '__main__':
    main_GXJSQYRDW_Allday()
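
For reference, to_database() assumes a policy_info table already exists in the test database; the original post does not show its schema. The sketch below creates a table whose column names match the INSERT statement above, while the column types and lengths are my own assumptions and may need adjusting to the real data.

import pymysql

# Minimal sketch of the policy_info table assumed by to_database().
# Column names come from the INSERT statement above; the types are assumptions.
CREATE_POLICY_INFO = """
CREATE TABLE IF NOT EXISTS policy_info (
    id                        INT AUTO_INCREMENT PRIMARY KEY,
    website_name              VARCHAR(100),
    website_block_name        VARCHAR(100),
    text_title                VARCHAR(500),
    text_date                 DATE,
    text_url                  VARCHAR(500),
    text_content              LONGTEXT,
    text_attachment_exist     VARCHAR(100),
    text_attachment_save_path TEXT,
    get_text_atta_okornot     TEXT,
    image_exist               VARCHAR(100),
    image_save_path           TEXT,
    get_image_okornot         TEXT
) DEFAULT CHARSET = utf8
"""


def create_policy_info_table():
    '''Create the policy_info table if it does not exist yet.'''
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='test', charset='utf8')
    try:
        with db.cursor() as cursor:
            cursor.execute(CREATE_POLICY_INFO)
        db.commit()
    finally:
        db.close()

Running create_policy_info_table() once before Spider_GXJSQYRDW_Allday().to_database() would set up the table this script writes to.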


 
