Python 3.5 + BeautifulSoup 4.4 腾讯（QQ）国内新闻爬虫-CSDN博客

本文链接：https://blog.csdn.net/gshsbb/article/details/53232250

Python代码

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
'''
Created on 2016年11月18日
@author: baoyou curiousby@163.com
'''
#下载
#http://ssdfz001.iteye.com/blog/2228685
import urllib.request
import urllib.parse
import os, sys
import codecs
import bs4
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.parse, http.cookiejar
# Site root; load() combines it with the links found on the listing page.
base_url='http://news.qq.com/'
# Listing page that the crawler starts from.
url='http://news.qq.com/c/816guonei_1.htm'
# Storage: root output directory and the names of the per-article
# sub-directories for the image and the text record.
save_path='C:/Users/cmcc-B100036/Desktop/'
save_img='img'
save_txt='text'
# Extraction regex for one list entry. It is not referenced by the functions
# visible in this file (they parse with BeautifulSoup).
# Groups: 1 = article href, 2 = thumbnail src, 3 = link text,
# 4 = everything after the closing </a>.
# The title group stops at the next '<', and the trailing group uses [\s\S]*
# because Python's re module rejects the empty negated class "[^]".
reg = (
    r'<a target="_blank" class="pic" href="([^"]*)">'
    r'<img class="picto" src="([^"]*)"></a>'
    r'<a target="_blank" class="linkto" href="[^"]*">([^<]*)</a>'
    r'([\s\S]*)'
)
# HTTP request headers imitating a desktop Chrome browser.
# No function visible in this file passes them to a request.
heads = dict([
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate, sdch'),
    ('Accept-Language', 'zh-CN,zh;q=0.8'),
    ('Cache-Control', 'max-age=0'),
    ('Host', 'news.qq.com'),
    ('Proxy-Connection', 'keep-alive'),
    ('Upgrade-Insecure-Requests', '1'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'),
])
#获取网页信息
def getHtml(url):
    """Fetch *url* and return the response body as text.

    The body is read in full and decoded as GBK.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The decoded page content as ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid GBK.
        Any exception raised by ``urllib.request.urlopen`` is propagated.
    """
    # The context manager closes the response even when read() fails.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode('gbk')
#获取新闻列表
def getList(url):
    """Return the news entries found on the listing page at *url*.

    The page is downloaded with ``getHtml`` and parsed with BeautifulSoup's
    ``html.parser``; every ``<div>`` carrying the CSS class ``Q-tpList`` is
    treated as one news entry.

    Args:
        url: Address of the listing page.

    Returns:
        A new list with the matched elements, in the order ``find_all``
        produced them. It is empty when nothing matches.
    """
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    # list() makes a plain copy without shadowing the builtin name.
    return list(soup.find_all('div', class_='Q-tpList'))
#获取文本信息到本地
def loadText(contents):
    """Save every news entry in *contents* by passing each one to ``load``."""
    for entry in contents:
        load(entry)
#下载资源
def load(content):
    """Save one news entry (thumbnail image plus a text record) to disk.

    The entry is inspected through ``find``: the ``<a class="pic">`` element
    supplies the article link and the thumbnail ``<img>``, the
    ``<a class="linkto">`` element supplies the title, and the first ``<p>``
    supplies the summary. Under ``save_path`` a directory named after the
    article's URL path (without its extension) is created, containing:

    * ``<save_img>/img.png``  - the downloaded thumbnail
    * ``<save_txt>/text.txt`` - one UTF-8 line holding the article URL, image
      URL, title and summary, separated by tabs

    Entries without a picture link, thumbnail or title are skipped. A missing
    summary is written as an empty string.

    Args:
        content: One element returned by ``getList`` (presumably a
            BeautifulSoup ``Tag``; only ``find`` is called on it).
    """
    pic_link = content.find('a', class_='pic')
    title_link = content.find('a', class_='linkto')
    summary = content.find('p')
    thumbnail = pic_link.img if pic_link is not None else None
    if thumbnail is None or title_link is None:
        # Nothing to download and no way to name the output directory.
        return

    # urljoin handles site-relative, protocol-relative and absolute links;
    # plain concatenation produced "//" or a broken URL for absolute hrefs.
    detailurl = urllib.parse.urljoin(base_url, pic_link['href'])
    detailimg = urllib.parse.urljoin(base_url, thumbnail['src'])
    detailtitle = title_link.get_text()
    detailcontent = summary.get_text() if summary is not None else ''

    # Build the directory from the URL *path* only, so a scheme or host in the
    # href never ends up in a file-system path.
    article_path = urllib.parse.urlparse(detailurl).path
    relative_dir = os.path.splitext(article_path)[0].lstrip('/')
    news_dir = os.path.join(save_path, relative_dir)
    newstext = os.path.join(news_dir, save_txt)
    newsimg = os.path.join(news_dir, save_img)
    # makedirs also creates news_dir; exist_ok avoids a check-then-create race.
    os.makedirs(newstext, 0o755, exist_ok=True)
    os.makedirs(newsimg, 0o755, exist_ok=True)

    # The image is always stored as img.png, whatever its real format.
    urllib.request.urlretrieve(detailimg, os.path.join(newsimg, 'img.png'))
    with codecs.open(os.path.join(newstext, 'text.txt'), 'w+', 'utf-8') as fp:
        fp.write('\t'.join((detailurl, detailimg, detailtitle, detailcontent)))
if __name__ == "__main__":
    # Script entry point: crawl the fixed listing page and save every entry.
    # An interactive prompt for the URL is not used; the address is hard-coded.
    print('---------------------start--------------------------------------')
    url = 'http://news.qq.com/c/816guonei_1.htm'
    loadText(getList(url))
    print('---------------------end---------------------------------------')