Crawling Important News from a Certain University

Code

The first script crawls the article list pages of news.shu.edu.cn, follows each article link, and saves the title, link, and body text to separate txt files in an articles directory. The second script then removes duplicate txt files by comparing MD5 hashes and renames the files that are kept.

import requests
from bs4 import BeautifulSoup
import os


def get_html(url):
    """Get the HTML content of the URL."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text


def get_article_links_and_titles(html):
    """Extract article titles and links from the list-page HTML."""
    soup = BeautifulSoup(html, 'lxml')

    # Find the table with class ArticleList
    article_list_table = soup.find('table', {'class': 'ArticleList'})

    articles = []
    if article_list_table:
        rows = article_list_table.find_all('tr')
        for row in rows:
            title_cell = row.find('a')  # assume the title is inside an <a> tag
            if title_cell and title_cell.has_attr('href'):
                title = title_cell.get_text(strip=True)
                link = title_cell['href']
                articles.append((title, link))
    return articles


def get_article_content(url):
    """Get the body text of an article given its URL."""
    article_html = get_html(url)
    soup = BeautifulSoup(article_html, 'lxml')

    # Adjust this selector to match the actual article body on the target site
    content_div = soup.find('div', {'class': 'v_news_content'})
    if content_div:
        return content_div.get_text(strip=True)
    else:
        return "No content found"


def save_articles_to_files(articles, base_url, start_index=1):
    """Save each article's title, link, and content to a separate txt file."""
    if not os.path.exists('articles'):
        os.makedirs('articles')

    for idx, (title, link) in enumerate(articles):
        full_link = requests.compat.urljoin(base_url, link)
        content = get_article_content(full_link)
        filename = f'articles/article_{start_index + idx}.txt'
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title}\n\n")
            file.write(f"Link: {full_link}\n\n")
            file.write(content)


# Crawl the first list page of the "important news" column
url_web = "https://news.shu.edu.cn/index/zyxw.htm"
html = get_html(url_web)
articles = get_article_links_and_titles(html)
start_index = 1
if articles:
    save_articles_to_files(articles, url_web, start_index)
    print(f"Saved {len(articles)} articles to the 'articles' directory.")
else:
    print("No articles found.")

# Crawl the remaining paginated list pages; "xxx" is a placeholder for the list-page URL prefix
for i in range(51):
    url_web = "xxx" + f"{150 - i}" + ".htm"
    html = get_html(url_web)
    articles = get_article_links_and_titles(html)
    start_index = 41 + i * 40
    if articles:
        save_articles_to_files(articles, url_web, start_index)
        print(f"Saved {len(articles)} articles to the 'articles' directory.")
    else:
        print("No articles found.")
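The crawl above hits the first list page, more than fifty paginated list pages, and every linked article with bare requests.get calls, so a single timeout or bad response stops the run. Below is a minimal sketch of a more defensive fetcher that could stand in for get_html; the User-Agent string, the 10-second timeout, and the retry/delay values are illustrative assumptions, not part of the original script.

import time

import requests


def get_html_with_retries(url, retries=3, delay=1.0):
    """Fetch a page with a timeout and simple retries (illustrative variant of get_html)."""
    headers = {"User-Agent": "Mozilla/5.0"}  # assumed value; adjust for your own crawler
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # surface HTTP errors such as 404 or 500
            response.encoding = 'utf-8'
            return response.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay)  # brief pause before retrying

Pausing briefly between list pages in the main loop (for example with time.sleep) also keeps the crawl polite to the news site.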
import os
import hashlib


def hash_file(filename):
    """返回文件的MD5哈希值"""
    hasher = hashlib.md5()
    with open(filename, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return hasher.hexdigest()


def find_and_delete_duplicates(directory):
    """查找并删除指定目录中的重复txt文件,并重新命名保留的文件"""
    files_seen = {}
    files_deleted = 0
    files_kept = 0

    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            filehash = hash_file(filepath)

            if filehash in files_seen:
                os.remove(filepath)
                files_deleted += 1
                print(f"删除重复文件: {filepath}")
            else:
                files_seen[filehash] = filepath
                files_kept += 1
                new_filename = f"通知_{files_kept}.txt"
                new_filepath = os.path.join(directory, new_filename)
                os.rename(filepath, new_filepath)
                print(f"保留并重命名文件: {new_filepath}")

    print(f"已删除 {files_deleted} 个重复文件。")
    print(f"已重命名 {files_kept} 个文件。")


# Directory containing the txt files to deduplicate
directory = 'articles'
find_and_delete_duplicates(directory)
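
hash_file reads each file into memory in one call, which is fine for short news articles but wasteful if the directory ever holds large files. The sketch below is a chunked variant using the same hashlib dependency; the 64 KB chunk size is an arbitrary illustrative choice.

import hashlib


def hash_file_chunked(filename, chunk_size=65536):
    """Return the MD5 hash of a file, reading it in fixed-size chunks (illustrative variant of hash_file)."""
    hasher = hashlib.md5()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

It produces the same hex digest as hash_file for identical content, so it can be swapped into find_and_delete_duplicates without other changes.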

