#!/usr/bin/python3
# -*- coding: UTF-8 -*-
"""
Downloader for the QQ national-news list page.

Created on 2016-11-18
@author: baoyou <curiousby@163.com>
Reference: http://ssdfz001.iteye.com/blog/2228685
"""
import urllib.request
import urllib.parse
import http.cookiejar
import os, sys
import codecs
import re

import bs4
from bs4 import BeautifulSoup

# Site root and the news-list page to crawl: http://news.qq.com/c/816guonei_1.htm
base_url = 'http://news.qq.com/'
url = 'http://news.qq.com/c/816guonei_1.htm'

# Local save locations: per-article subfolders of save_path holding
# an "img" and a "text" directory.
save_path = 'C:/Users/cmcc-B100036/Desktop/'
save_img = 'img'
save_txt = 'text'

# Extraction regex for one list entry.
# NOTE(review): appears unused — getList() parses with BeautifulSoup instead.
reg = '<a target=\"_blank\" class=\"pic\" href=\"([^\"]*)\"><img class=\"picto\" src=\"([^\"]*)\"></a><em class=\"f14 l24\"><a target=\"_blank\" class=\"linkto\" href=\"[^\"]*\">([^</a>]*)</a></em><p class=\"l22\">([^</p>]*)</p>'

# HTTP request headers.
# NOTE(review): also unused — getHtml() calls urlopen without them; kept for reference.
heads = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Host': 'news.qq.com',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
}
- #获取网页信息
# Fetch one page of HTML.
def getHtml(url):
    """Download *url* and return its body decoded as GBK text.

    Raises whatever ``urlopen`` raises on network failure, and
    ``UnicodeDecodeError`` if the body is not valid GBK.
    """
    # Context manager closes the connection even if read()/decode() raises
    # (the original leaked the handle on any exception).
    with urllib.request.urlopen(url) as fp:
        raw = fp.read()
    # NOTE(review): charset hard-coded to GBK — confirm the site still serves GBK.
    return raw.decode('gbk')
- #获取新闻列表
# Fetch the news list.
def getList(url):
    """Fetch the list page at *url* and return the ``<div class="Q-tpList">``
    nodes, one per news entry."""
    respHtml = getHtml(url)
    soup = BeautifulSoup(respHtml, 'html.parser')
    # find_all already yields every match; the original copied it element by
    # element into a second list (and shadowed the builtin ``list`` doing so).
    return list(soup.find_all('div', class_='Q-tpList'))
- #获取文本信息到本地
# Persist every fetched news entry.
def loadText(contents):
    """Run load() over each entry node in *contents* (side effects only)."""
    for entry in contents:
        load(entry)
- #下载
# Download one news entry to disk.
def load(content):
    """Save a single news item to the local disk.

    *content* is one ``<div class="Q-tpList">`` node from getList(). The
    detail link, thumbnail URL, title and summary are extracted from it;
    ``<save_path>/<article-path>/img/img.png`` and
    ``<save_path>/<article-path>/text/text.txt`` (a tab-separated record)
    are written.
    """
    urlsuffix = content.find('a', class_='pic')['href']
    detailurl = base_url + urlsuffix
    detailimg = content.find('a', class_='pic').img['src']
    detailtitle = content.find('a', class_='linkto').get_text()
    detailcontent = content.find('p').get_text()

    # Per-article directory under the module-level save_path (the original
    # re-hardcoded the same desktop path locally, shadowing the constant).
    article_dir = save_path + urlsuffix.replace(".htm", "")
    newstext = article_dir + '/%s' % save_txt
    newsimg = article_dir + '/%s' % save_img
    # exist_ok=True replaces the racy exists()/makedirs() pairs; makedirs
    # also creates article_dir itself, so no separate call is needed.
    os.makedirs(newstext, 0o755, exist_ok=True)
    os.makedirs(newsimg, 0o755, exist_ok=True)

    # Download the thumbnail, then write the metadata record.
    urllib.request.urlretrieve(detailimg, newsimg + "/img.png")
    with codecs.open(newstext + "/text.txt", 'w+', 'utf-8') as fp:
        fp.write(detailurl + '\t' + detailimg + '\t' + detailtitle + '\t' + detailcontent)
if __name__ == "__main__":
    # Entry point: crawl the fixed list page and persist every item found.
    print('---------------------start--------------------------------------')
    url = 'http://news.qq.com/c/816guonei_1.htm'
    loadText(getList(url))
    print('---------------------end---------------------------------------')