BeautifulSoup 下载
https://www.crummy.com/software/BeautifulSoup/bs4/download/4.1/
本人的python 安装在I:\software\Python27\beautifulsoup4-4.1.3
按住ctrl 右键 在此处打开命令行
dir 存在 setup.py
setup.py build
setup.py install
验证如下: 就没有问题
# -*- coding: UTF-8 -*-
'''
Created on 2016年8月1日
@author: cmcc-B100036
'''
#http://ssdfz001.iteye.com/blog/2228685
import urllib2,os,codecs
from bs4 import BeautifulSoup
# Root URL to scrape: http://news.qq.com/c/816guonei_1.htm
url = 'http://news.qq.com/c/816guonei_1.htm'
# Local directory under which each news item gets its own folder
save_path = 'I:/software/Python27/pythonData/'
# Per-article output file names.
# BUG FIX: the original assignments were swapped — the image file was named
# 'text.text' and the text file 'png.png'.
save_img = 'png.png'
save_txt = 'text.text'
# Extraction regex (unused by the BeautifulSoup-based code below; kept for reference)
reg = '<a target=\"_blank\" class=\"pic\" href=\"([^\"]*)\"><img class=\"picto\" src=\"([^\"]*)\"></a><em class=\"f14 l24\"><a target=\"_blank\" class=\"linkto\" href=\"[^\"]*\">([^</a>]*)</a></em><p class=\"l22\">([^</p>]*)</p>'
# Browser-like request headers so the server serves the normal page
heads = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Host': 'news.qq.com',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
}
# Fetch one page and return its raw body.
def getHtml(url):
    """Fetch *url* with the browser-like headers in ``heads`` and return
    the raw response bytes.

    Uses a cookie-aware opener so the session cookie set by the news site
    is carried along.
    """
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    urllib2.install_opener(opener)
    # Attach the headers to the opener BEFORE issuing the request.
    opener.addheaders = heads.items()
    req = urllib2.Request(url)
    return opener.open(req).read()
# Collect the news-item blocks from the listing page.
def getList(url):
    """Return all news-item ``<div class="Q-tpList">`` tags on the page at *url*.

    BUG FIX: the original called ``find_all('div', 'class_=Q-tpList')``,
    which filters on the literal CSS class string ``"class_=Q-tpList"`` and
    therefore matches nothing; the class filter must be the ``class_``
    keyword argument. The manual append loop (and the builtin-shadowing
    name ``list``) is replaced by returning the result set directly.
    """
    respHtml = getHtml(url)
    # The page is served in gb2312; tell the parser explicitly.
    soup = BeautifulSoup(respHtml, from_encoding="gb2312")
    return soup.find_all('div', class_='Q-tpList')
# Persist every extracted news block to local disk.
def loadText(contents):
    """Run load() over each news-item block in *contents*."""
    for item in contents:
        load(item)
#下载资源
def load(content):
soup = BeautifulSoup(content,from_encoding="gb2312")
newsdetailname=soup.find_all('a','class_=pic').attrs["href"].get_text().replace('.htm','')
newsimagpichref= soup.find_all('img','class_=picto').attrs["src"].get_text()
newstitle = soup.find_all('a','class_=linkto').get_text()
newscontent = soup.find_all('p','class_=112').get_text()
save_path += newsdetailname;
if not os.path.exists(save_path):
os.mkdir(save_path)
newstext = save_path+'\%s'%save_txt
newsimg= save_path+'\%s'%save_img
if not os.path.exists(newstext):
os.mkdir(newstext)
if not os.path.exists(newsimg):
os.mkdir(newsimg)
imgsrc= urllib2.urlopen(newsimagpichref).read()
with codecs.open(newsimg,"a+", "gb2312") as fp:
fp.write(imgsrc)
with codecs.open(newstext,'r','gb2312') as fp:
fp.write(newsimagpichref+'\t'+newstitle+'\t'+newscontent+'\t')
print '------------------------------------------------------------ end one news'
if __name__ == "__main__":
    # url = raw_input("Enter the target URL, then press Enter\n")
    url = 'http://news.qq.com/c/816guonei_1.htm'
    contents = getList(url)
    loadText(contents)
package com.curiousby.python.demo;

import org.python.util.PythonInterpreter;

/**
 * Launches the news-scraper Python script through the embedded Jython
 * interpreter.
 *
 * @author baoyou E-mail:curiousby@163.com
 * @version 2016年8月1日 下午1:05:36
 */
public class PythonByJava2 {

    public static void main(String[] args) {
        final PythonInterpreter interpreter = new PythonInterpreter();
        interpreter.execfile("I:\\cache\\ea-ws\\DemoJava\\conf\\newsqq.py");
    }
}
捐助开发者
在兴趣的驱动下,写一个免费
的东西,有欣喜,也还有汗水,希望你喜欢我的作品,同时也能支持一下。 当然,有钱捧个钱场(右上角的爱心标志,支持支付宝和PayPal捐助),没钱捧个人场,谢谢各位。
谢谢您的赞助,我会做的更好!