import requests#网页请求
import bs4#网页解析
import re#正则表达式
import os#cmd命令
import time
def strcmp(str1,str2):
if str2:#爬虫爬的数据是空不做比较
if str1[:len(str2)].replace(' ','') == str2.replace(' ',''):
return 1
elif str2[:len(str1)].replace(' ','') == str1.replace(' ',''):
return 1
return 0
def print_article(soup):
    """Print the body text of a news article page.

    Tries the main article container (div.rm_txt_con.cf) first, then
    falls back to indented <p> paragraphs. Returns 1 if an article body
    was found and printed, 0 otherwise.
    """
    content = soup.find('div', class_='rm_txt_con cf')
    if content:
        for each in content:
            if each and each.string is not None:
                print(each.string)
        return 1
    # reuse the query result instead of calling find_all a second time
    paragraphs = soup.find_all('p', style='text-indent: 2em;')
    if paragraphs:
        for each in paragraphs:
            if each and each.string is not None:
                print(each.string)
            # bug fix: the original tested each.string here (making the
            # branch unreachable); the span's own string is what's printed
            elif each.span and each.span.string is not None:
                print(each.span.string)
        return 1
    return 0  # explicit failure signal instead of implicit None
def save_news(soup, newsname):
    """Save a news article to a local text file (收藏新闻).

    Asks the user for confirmation first. Returns 0 if the user declines,
    1 on success; prints a failure message (and returns None) when no
    article body can be found.
    """
    save = input("是否收藏该新闻?\n收藏请输入0以外任何字符\n不收藏请输入0")
    if save == '0':
        return 0
    # strip characters that are awkward/illegal in Windows filenames
    path = 'E:/py爬虫/news/' + time.strftime("%Y%m%d") + newsname.replace('《', '').replace('》', '').replace('"', '') + '.txt'
    content = soup.find('div', class_='rm_txt_con cf')
    if content:
        print('收藏成功')
        # bug fix: open the file only after content is found, so a failed
        # lookup no longer leaves behind an empty file and an open handle;
        # `with` guarantees the handle is closed
        with open(path, 'w', encoding='utf-8') as file:
            for each in content:
                if each and each.string is not None:
                    file.write(each.string)
        return 1
    # reuse the query result instead of calling find_all a second time
    paragraphs = soup.find_all('p', style='text-indent: 2em;')
    if paragraphs:
        print('收藏成功')
        with open(path, 'w', encoding='utf-8') as file:
            for each in paragraphs:
                if each and each.string is not None:
                    file.write(each.string)
                # bug fix: test the span's own string before writing it
                elif each.span and each.span.string is not None:
                    file.write(each.span.string)
        return 1
    print("收藏失败!")
def open_url(url):
    """GET *url* with a desktop-browser User-Agent and return the response.

    The response is decoded as GBK (people.com.cn serves GBK pages).
    """
    browser_ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36')
    response = requests.get(url, headers={'user-agent': browser_ua})
    response.encoding = 'gbk'
    return response
def get_data(res):
    """Dump the response body to test.txt (debugging aid)."""
    with open("test.txt", "w", encoding="utf-8") as dump_file:
        dump_file.write(res.text)
def get_hotnews(soup):
    """Interactive browser for the "hot news" (要闻) section of the front page.

    Lists the top-line (big headline) and top-read (small headline) stories,
    then lets the user open one, optionally save it, or leave. Recurses on
    invalid input. Returns 1 after reading a story, 0 on exit.
    """
    content = soup.find(id="hotnews")
    topline_target = content.find(id="rm_topline")  # big-headline block
    topread_target = content.find(id="rm_topread")  # small-headline block
    for link in topline_target.find_all('a'):
        print("大标题要闻链接",link.get('href'))  # big-headline link
        if topline_target.string is None:  # headline is an image, fetch the page for its <title>
            # NOTE(review): this tests topline_target.string inside the
            # per-link loop — presumably link.string was intended; confirm.
            res = open_url(link.get('href'))
            topline_soup = bs4.BeautifulSoup(res.text, "html.parser")
            print("大标题要闻:",topline_soup.title.string)
            newsname = topline_soup.title.string
        else:  # headline is plain text
            print("大标题要闻:",topline_target.string)  # big-headline title text
            newsname = topline_target.string
    for link in topread_target.find_all('a'):
        print("小标题要闻链接:", link.get('href'))  # small-headline link
        print("小标题要闻:",link.string)  # small-headline title text
    key = input('输入1访问大标题要闻\n输入2访问小标题要闻\n输入3退出该界面')
    if key == '1':
        for link in topline_target.find_all('a'):
            res = open_url(link.get('href'))
            topline_soup = bs4.BeautifulSoup(res.text, "html.parser")
            print_article(topline_soup)
            # NOTE(review): newsname is only bound inside the listing loop
            # above — if rm_topline had no <a> this raises NameError; confirm.
            save_news(topline_soup,newsname)
            return 1
    elif key == '2':
        str = input("请输入你浏览的小新闻名称!")  # shadows builtin str (pre-existing)
        for link in topread_target.find_all('a'):
            if strcmp(str,link.string):
                res = open_url(link.get('href'))
                topread_soup = bs4.BeautifulSoup(res.text, "html.parser")
                print_article(topread_soup)
                newsname = link.string
                save_news(topread_soup,newsname)
                return 1
        print("*" * 30)
        print("您输入错误,请重试!")
        print("*" * 30)
        get_hotnews(soup)  # retry via recursion
    elif key == '3':
        return 0
    else:
        print("*" * 30)
        print("您输入错误,请重试!")
        print("*" * 30)
        get_hotnews(soup)  # retry via recursion
def get_imgnews(soup):
    """Interactive browser for the carousel (动图) news slides.

    Prints each slide's caption, then lets the user open one by name or
    leave. Recurses on invalid input. Returns 0 on exit or after reading.
    """
    content = soup.find_all("div",class_="swiper-slide")
    for each in content:
        if each.span:
            # replace('\n', '') guards against line breaks inside the HTML caption
            each.img_name = each.span.a.string.replace('\n', '')
            print(each.img_name)
    str = input("请输入你浏览的新闻名称!\n输入0退出该界面")  # shadows builtin str (pre-existing)
    if str == '0':
        return 0
    for each in content:
        # NOTE(review): slides without a <span> never had img_name assigned;
        # each.img_name then falls back to bs4's child-tag lookup (likely
        # None, which strcmp treats as no-match) — confirm this cannot raise.
        if strcmp(str,each.img_name):
            res = open_url(each.a.get('href'))
            newsname = each.img_name
            soup = bs4.BeautifulSoup(res.text, "lxml")
            print_article(soup)
            save_news(soup, newsname)
            return 0
    print("*" * 30)
    print("您输入错误,请重试!")
    print("*" * 30)
    get_imgnews(soup)  # retry via recursion
def get_sidebarnews(soup):
    """Interactive browser for the sidebar (rm_aq) news block.

    Prints the two featured headlines (fetched from their own pages) and
    the plain <li> headlines, then lets the user open one by name or leave.
    Recurses on invalid input. Returns 1 after reading a story, 0 on exit.
    """
    content = soup.find(id="rm_aq")
    res1 = open_url(content.h2.a.get('href'))  # first featured (picture-headline) story
    soup1 = bs4.BeautifulSoup(res1.text, "html.parser")
    print(soup1.title.string)
    res2 = open_url(content.find('h2',class_ = "A6").a.get("href"))  # second featured story
    soup2 = bs4.BeautifulSoup(res2.text, "html.parser")
    print(soup2.title.string)
    for each in content.find_all('li'):
        if each.string != None:
            print(each.string)
    str = input("请输入你浏览的小新闻名称!\n输入0退出该界面")  # shadows builtin str (pre-existing)
    if str == '0':
        return 0
    # try the plain list items first, then the two featured headlines
    for each in content.find_all('li'):
        if each.string != None:
            if strcmp(str,each.string):
                _url = each.a.get('href')
                newsname = each.string
                res = open_url(_url)
                soup = bs4.BeautifulSoup(res.text, "lxml")
                print_article(soup)
                save_news(soup, newsname)
                return 1
    if strcmp(str,soup1.title.string):
        _url = content.h2.a.get('href')
        newsname = soup1.title.string
        res = open_url(_url)
        soup = bs4.BeautifulSoup(res.text, "lxml")
        print_article(soup)
        save_news(soup, newsname)
        return 1
    if strcmp(str,soup2.title.string):
        _url = content.find('h2',class_ = "A6").a.get("href")
        newsname = soup2.title.string
        res = open_url(_url)
        soup = bs4.BeautifulSoup(res.text, "lxml")
        print_article(soup)
        save_news(soup, newsname)
        return 1
    print("*" * 30)
    print("您输入错误,请重试!")
    print("*" * 30)
    get_sidebarnews(soup)  # retry via recursion
def read_savenews():
    """List saved news files and print the one the user picks.

    Returns 0 on exit or after reading a saved article; recurses to retry
    on unrecognized input.
    """
    folder = 'E:/py爬虫/news/'
    # robustness: don't crash when the archive directory doesn't exist yet;
    # also list the directory once instead of three times
    saved = os.listdir(folder) if os.path.isdir(folder) else []
    for each in saved:
        print(each.replace('.txt', ''))
    if not saved:
        print("您暂未收藏任何新闻!")
    choice = input("请输入你要阅读的收藏新闻!\n输入0退出该界面")  # renamed: no longer shadows builtin str
    if choice == '0':
        return 0
    wanted = choice.replace(' ', '')
    for each in saved:
        if wanted == each.replace('.txt', '').replace(' ', ''):
            # `with` guarantees the handle is closed even if read() raises
            with open(folder + each, 'r', encoding="utf-8") as file:
                print(file.read())
            return 0
    print("*" * 30)
    print("您输入错误,请重试!")
    print("*" * 30)
    read_savenews()  # retry via recursion
def make_file():
    """Ensure the news-archive directory exists, creating it if needed."""
    if not os.path.exists('E:/py爬虫/news'):
        # bug fix: the original created 'E:/py爬虫1/news' (stray "1"), so the
        # directory tested above was never the one that got created
        os.makedirs('E:/py爬虫/news')  # makedirs creates intermediate dirs too
def main():
    """Entry point: fetch the People's Daily front page and run the menu loop."""
    url = "http://www.people.com.cn/"
    res = open_url(url)  # open_url already sets res.encoding = 'gbk' (redundant re-assignment removed)
    make_file()
    get_data(res)  # dump the raw page for debugging; returns None, so no binding
    soup = bs4.BeautifulSoup(res.text, "lxml")
    print("欢迎使用人民网新闻自助阅读工具!\nmade by shui0")
    while 1:
        key = input("1:浏览标题新闻\n2:浏览动图新闻\n3:浏览边栏新闻\n4:浏览已收藏新闻\n5:退出程序\n请输入对应数字以选择对应功能!")
        if key == '1':
            get_hotnews(soup)
        elif key == '2':
            get_imgnews(soup)
        elif key == '3':
            get_sidebarnews(soup)
        elif key == '4':
            read_savenews()
        elif key == '5':
            exit(0)
        else:
            print("请输入正确的数字!")
# Run the interactive reader only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# 初学爬虫,手写一个爬虫代码,供各位大佬参考借鉴,如有错误,还请指正。
# (备注:此版并不为最终版,作者将逐渐更新此爬虫代码)