爬虫2
爬虫1-bs解析
'''
find :找出第一个标签
find_all:找出所有的标签
'''
'''
1、string:获取某个标签下的非标签字符串,返回来的是个字符串
2、strings:获取某个标签下的子孙非标签字符串,返回来的是个生成器
3、stripped_strings:获取某个标签下的子孙非标签字符串,会去掉空白字符串,返回来的是个生成器
4、get_text:获取某个标签下的子孙非标签字符串,不是以列表的形式返回,是以普通字符串返回
'''
#Author lpf
#usr/bin/src
html = """
<!doctype html><html class="nx-main980" >
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE8" />
<meta name="Description" content="人人网 校内是一个真实的社交网络,联络你和你周围的朋友。 加入人人网校内你可以:联络朋友,了解他们的最新动态;和朋友分享相片、音乐和电影;找到老同学,结识新朋友;用照片和日志记录生活,展示自我。" />
<meta name="Keywords" content="Xiaonei,Renren,校内,大学,同学,同事,白领,个人主页,博客,相册,群组,社区,交友,聊天,音乐,视频,校园,人人,人人网" />
<meta property="qc:admins" content="232517306762562566375" />
<meta property="wb:webmaster" content="f2fdc876b8ba2a5d" />
<meta name="msApplication-ID" content="App" />
<meta name="msApplication-PackageFamilyName" content="57722RenRenpreview.RenrenHD_fknrsfzqca1jw" /><link rel="shortcut icon" type="image/x-icon" href="http://a.xnimg.cn/favicon-rr.ico?ver=3" />
<link rel="apple-touch-icon" href="http://a.xnimg.cn/wap/apple_icon_.png" />
<script type="text/javascript">
XN = {get_check:'',get_check_x:'bc40edb7',env:{domain:'renren.com',shortSiteName:'人人',siteName:'人人网'}};
try {
window.onerror=function(){
var a=arguments,e=encodeURIComponent,l=location,d=new Date();
if(a.length!=3||a[2]==0)return 1;
new Image().src=['http://s.renren.com/speedstats/jserror/stats.php?message='+e(a[0]),
'url='+e(a[1]),'lineNo='+a[2],'location='+e(l),'time='+d.toLocaleTimeString()].join('&');
return 1;
};
} catch (e){};
</script><meta charset="utf-8"/>
<link rel="shortcut icon" type="image/x-icon" href="http://a.xnimg.cn/favicon-rr.ico?ver=3" />
<link rel="apple-touch-icon" href="http://a.xnimg.cn/wap/apple_icon_.png" />
<link rel="stylesheet" type="text/css" href="http://s.xnimg.cn/a86614/nx/core/base.css">
<script type="text/javascript">
if(typeof nx === 'undefined'){
var nx = {};
}
nx.log = {
startTime : + new Date()
};
nx.user = {
id : "",
ruid:"",
tinyPic : " ",
name : "",
privacy: "",
requestToken : '',
_rtk : 'bc40edb7'
};nx.user.isvip = false;nx.user.hidead = false;nx.webpager = nx.webpager || {};
nx.production = true;
</script>
<script type="text/javascript" src="http://s.xnimg.cn/a83151/nx/core/libs.js"></script>
<script type="text/javascript">
define.config({map:{
"backbone":"http://s.xnimg.cn/a75208/nx/core/backbone.js",
"ui/draggable":"http://s.xnimg.cn/a70750/nx/core/ui/draggable.js",
"ui/menu":"http://s.xnimg.cn/a70736/nx/core/ui/menu.js",
"ui/resizable":"http://s.xnimg.cn/a70738/nx/core/ui/resizable.js",
"ui/sortable":"http://s.xnimg.cn/a70749/nx/core/ui/sortable.js",
"ui/tabs":"http://s.xnimg.cn/a78333/nx/core/ui/tabs.js",
"ui/ceiling":"http://s.xnimg.cn/a76297/nx/core/ui/ceiling.js",
"ui/columns":"http://s.xnimg.cn/a68070/nx/core/ui/columns.js",
"ui/dialog":"http://s.xnimg.cn/a76395/nx/core/ui/dialog.js",
"ui/fileupload":"http://s.xnimg.cn/a81310/nx/core/ui/fileupload.js",
"ui/pagination":"http://s.xnimg.cn/a70307/nx/core/ui/pagination.js",
"ui/placeholder":"http://s.xnimg.cn/a79685/nx/core/ui/placeholder.js",
"ui/progressbar":"http://s.xnimg.cn/a62964/nx/core/ui/progressbar.js",
"ui/rows":"http://s.xnimg.cn/a62964/nx/core/ui/rows.js",
"ui/scroll":"http://s.xnimg.cn/a61518/nx/core/ui/scroll.js",
"ui/scrollbar":"http://s.xnimg.cn/a76868/nx/core/ui/scrollbar.js",
"ui/select":"http://s.xnimg.cn/a82096/nx/core/ui/select.js",
"ui/slideshow":"http://s.xnimg.cn/a72804/nx/core/ui/slideshow.js",
"ui/speech":"http://s.xnimg.cn/a77631/nx/core/ui/speech.js",
"ui/textbox":"http://s.xnimg.cn/a79526/nx/core/ui/textbox.js",
"ui/renren/textbox":"http://s.xnimg.cn/a92727/nx/core/ui/renren/textbox.js",
"ui/tooltip":"http://s.xnimg.cn/a73228/nx/core/ui/tooltip.js",
"ui/renren/addfriend":"http://s.xnimg.cn/a78457/nx/core/ui/renren/addFriendLayer.js",
"ui/renren/at":"http://s.xnimg.cn/a78409/nx/core/ui/renren/atAndEmotion.js",
"ui/renren/emotion":"http://s.xnimg.cn/a78409/nx/core/ui/renren/atAndEmotion.js",
"ui/renren/commentCenter":"http://s.xnimg.cn/a83569/nx/core/ui/renren/commentCenter.js",
"ui/renren/friendgroup":"http://s.xnimg.cn/a62964/nx/core/ui/renren/friendGroup.js",
"ui/renren/friendListSelector":"http://s.xnimg.cn/a78513/nx/core/ui/renren/friendListSelector.js",
"ui/renren/like":"http://s.xnimg.cn/a83569/nx/core/ui/renren/like.js",
"nx/namecard":"http://s.xnimg.cn/a77668/nx/core/ui/renren/namecard.js",
"ui/renren/pagelayer":"http://s.xnimg.cn/a62964/nx/core/ui/renren/pageLayer.js",
"ui/renren/photoupload":"http://s.xnimg.cn/a82833/nx/core/ui/renren/photoupload.js",
"ui/renren/privacy":"http://s.xnimg.cn/a76680/nx/core/ui/renren/privacy.js",
"ui/renren/share":"http://s.xnimg.cn/a78999/nx/core/ui/renren/share.js",
"ui/renren/vocal":"http://s.xnimg.cn/a77347/nx/core/ui/renren/vocal.js",
"ui/renren/mvideo":"http://s.xnimg.cn/a80641/nx/core/ui/renren/mvideo.js",
"ui/renren/with":"http://s.xnimg.cn/a82994/nx/core/ui/renren/with.js",
"ui/clipboard":"http://s.xnimg.cn/a63417/nx/core/ui/clipboard.js",
"app/publisher":"http://s.xnimg.cn/a91505/nx/core/app/publisher.js",
"viewer":"http://s.xnimg.cn/a83025/nx/photo/viewer/js/viewer.js",
"media/player": "http://s.xnimg.cn/nx/photo/viewer/js/mediaplayer.js",
"ui/renren/like/commentseed":"http://s.xnimg.cn/a64636/nx/core/ui/renren/like.seed.comment.js",
"ui/renren/like/seed":"http://s.xnimg.cn/a62964/nx/core/ui/renren/like.seed.js",
"ui/renren/share/seed":"http://s.xnimg.cn/a62964/nx/core/ui/renren/share.seed.js",
"ui/renren/follow":"http://s.xnimg.cn/a78456/nx/core/ui/renren/follow.js",
"ui/renren/relationFollow":"http://s.xnimg.cn/a78457/nx/core/ui/renren/relationFollow.js",
"ui/autocomplete":"http://s.xnimg.cn/a70736/nx/core/ui/autocomplete.js",
"ui/showCommonFriend":"http://s.xnimg.cn/a78917/nx/core/ui/renren/showcommonfriend.js",
"photo/circler":"http://s.xnimg.cn/a73344/nx/photo/phototerminal/js/circler.js",
"ui/friendSearch":"http://s.xnimg.cn/a64338/nx/core/ui/renren/friendSearch.js",
"ui/renren/replyOption":"http://s.xnimg.cn/a68256/nx/core/ui/renren/replyOption.js",
"photo/avatarUpload": "http://s.xnimg.cn/a77340/nx/photo/upload-avata/js/avatarUpload.js",
"ui/renren/school":"http://s.xnimg.cn/a85689/nx/core/ui/renren/school.js"
}});
nx.data.isDoubleFeed = Boolean();
nx.data.isDoubleFeedGuide = Boolean();
</script>
<script type="text/javascript" src="http://s.xnimg.cn/a95943/nx/core/base.js"></script>
<!--[if lt IE 9]>
<script type="text/javascript">
document.execCommand("BackgroundImageCache", false, true);
</script>
<![endif]-->
<script type="text/javascript">nx.webpager.disable = true;</script>
<link href="login.css" rel="stylesheet" type="text/css" media="all" />
<title>人人网,中国领先的实名制SNS社交网络。加入人人网,找到老同学,结识新朋友。</title><script src="http://s.xnimg.cn/a72842/n/core/base-all2.js" type="text/javascript"></script>
</head>
<body id="syshome" class="login">
<div id="header">
<div id="navBar" class="site-nav rr">
<div class="navigation-wrapper">
<div class="navigation navigation-new clearfix">
<div id="logo2">
<h1>
<a href="http://www.renren.com" title="人人网 renren.com - 人人网校内是一个真实的社交网络,联系朋友,一起玩游戏">
<img alt="人人网 renren.com - 人人网校内是一个真实的社交网络,联系朋友,一起玩游戏" title="人人网 renren.com - 人人网校内是一个真实的社交网络,联系朋友,一起玩游戏" src="http://a.xnimg.cn/nx/apps/login/cssimg/logo-big.jpg" />
</a>
</h1>
</div>
<div class="nav-body clearfix">
<div class="nav-other">
<div class="menu">
<a href="http://st.renren.com" target="_blank" class="st-btn">学生团体申请入口</a>
</div>
<div class="menu">
<a id="reg_link" title="注册" stats="homenav_reg" href="http://wwv.renren.com/xn.do?ss=10131&rt=1&g=v6reg">注册</a>
</div>
<div class="menu">
<a title="给我们提建议" stats="homenav_suggest" href="http://support.renren.com/link/suggest">反馈意见</a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div id="opi" class="page-wrapper clearfix">
<div class="full-page-holder">
<div class="full-page"><div class="login-page clearfix login-wrap">
<div class="side-column login-box">
<div class="login-panel ">
<div class="radiusimg">
<div class="shadow"></div>
<div class="pic"><img src="" id="personhead"/></div>
<!-- <img src="http://a.xnimg.cn/nx/apps/login/cssimg/person.jpg"> -->
</div>
<span id="errorMessage" class="errors_div" style="display:none;"></span>
<div class="yellow-error" id="yellow_error">
<a class="close" href="javascript:closeError();"></a>
<p class="wrong">您的用户名和密码不匹配</p>
<p class="worp">为了账号安全,已向您的邮箱: <strong id="sendemail"></strong>发送了一封确认信,请通过邮件内链接登录。</p>
<p class="m-26"><a id="gotoEmail" href="#" target="_blank">打开邮箱查收确认信</a></p>
<p class="m-26"><a href="javascript:closeError();">重新输入</a></p>
</div>
<div class="yellow-error" id="account_stop">
<a class="close" href="javascript:closeStop();"></a>
<p class="wrong"></p>
<p class="center">您的账号已停止使用,如有疑问请联系<a style="background:none;padding-left:0;" target="_blank" href="http://help.renren.com/#http://help.renren.com/support/contomvice?pid=2&selection={couId:193,proId:342,cityId:1000375}">客服</a></p>
</div>
<div class="yellow-error" id="account_lock">
<a class="close" href="javascript:closeLock();"></a>
<p class="wrong">您的账号由于以下某种原因需要解锁才能登录</p>
<ol>
<li>删除过账号</li>
<li>长时间没有登录网站</li>
<li>安全原因</li>
</ol>
<p class="center"><a href="http://safe.renren.com/relive.do">立即解锁</a></p>
</div>
<form method="post" id="loginForm" class="login-form" action="http://www.renren.com/PLogin.do">
<dl class="top clearfix">
<dd>
<input type="text" name="email" class="input-text" id="email" tabindex="1" value="" />
</dd>
</dl>
<dl class="pwd clearfix">
<dd>
<input type="password" id="password" name="password" error="请输入密码" class="input-text" tabindex="2"/>
<label class="pwdtip" id="pwdTip" for="password">请输入密码</label>
<a class="forgetPwd" id="forgetPwd" href="http://safe.renren.com/findPass.do" stats="home_findpassword">忘记密码?</a>
</dd>
</dl>
<div class="caps-lock-tips" id="capsLockMessage" style="display:none"></div>
<dl class="savepassword clearfix">
<dt>
<label title="为了确保您的信息安全,请不要在网吧或者公共机房勾选此项!" for="autoLogin" class="labelCheckbox">
<input type="checkbox" name="autoLogin" id="autoLogin" value="true" tabindex="4" />下次自动登录
</label>
</dt>
<dd>
<span class="getpassword" id="getpassword"><a href="http://safe.renren.com/findPass.do" stats="home_findpassword">忘记密码?</a></span>
</dd>
</dl>
<dl id="code" class="code clearfix">
<dt><label for="code">验证码:</label></dt>
<dd>
<input id="icode" type="text" name="icode" class="input-text" tabindex="3" autocomplete="off" />
<label class="codetip" id="codeTip" for="icode">请输入验证码</label>
</dd>
</dl>
<dl id="codeimg" class="codeimg clearfix">
<dt></dt>
<dd><img id="verifyPic_login" src="http://icode.renren.com/getcode.do?t=web_login&rnd=Math.random()"/>
</dd>
<a class="changeone" href="javascript:refreshCode_login();" >换一个</a>
</dl>
<dl class="bottom">
<input type="hidden" name="origURL" value="http://www.renren.com/880151247/profile" />
<input type="hidden" name="domain" value="renren.com" />
<input type="hidden" name="key_id" value="1" />
<input type="hidden" name="captcha_type" id="captcha_type" value="web_login" />
<input type="submit" id="login" class="input-submit login-btn" stats="loginPage_login_button" value="登录" tabindex="5"/>
</dl>
</form>
<div class="regnow">
<input type="button" onclick="window.location='http://wwv.renren.com/xn.do?ss=10131&rt=1&g=v6reg'" id="regnow" class="input-button login-btn regbutton" value="注册" tabindex="6" stats="loginPage_signUp_button" />
</div>
<p class="third-party-title"><span class="underscore left"></span><span class="text">第三方登录</span><span class="underscore right"></span></p>
<div class="login_corp" >
<div class="Third-partyi-login">
<a title="微信" class="login-item weixin" href="http://www.renren.com/api/jump?src=wx" id="login_weixin" stats="loginPage_weixin_link"></a>
<a title="QQ" class="login-item qq" href="http://www.renren.com/api/jump?src=qq" id="login_qq" stats="loginPage_qq_link"></a>
<a title="微博" class="login-item weibo" href="http://www.renren.com/api/jump?src=wb" id="login_weibo" stats="loginPage_weibo_link"></a>
</div>
</div>
<div class="other-login clearfix">
<div class="login-word login-item">其它账号登录:</div>
<a title="移动" class="login-item yidong" href="https://open.mmarket.com:443/omee-aus/services/oauth/authorize?responseType=code&scope=getUserInfo&clientId=300007884008&redirectUri=http%3A%2F%2Fwww.renren.com%2Fbind%2Fcnmobile%2FloginCallBack&clientState=9" id="login_cnmobile" stats="loginPage_baidu_link"></a>
<a title="天翼" class="login-item tianyi" id="login_tianyi" href="https://oauth.api.189.cn/emp/oauth2/authorize?app_id=296961050000000294&response_type=code&redirect_uri=http://www.renren.com/bind/ty/tyLoginCallBack" stats="loginPage_tianyi_link"></a>
<a title="360" class="login-item lo360" id="login_360" href="https://openapi.360.cn/oauth2/authorize?client_id=5ddda4458747126a583c5d58716bab4c&response_type=code&redirect_uri=http://www.renren.com/bind/tsz/tszLoginCallBack&scope=basic&display=default" stats="loginPage_360_link"></a>
<a title="百度" class="login-item baidu" href="https://openapi.baidu.com/oauth/2.0/authorize?response_type=code&client_id=foRRWjPq8In3SIhmKQw1Pep3&redirect_uri=http%3A%2F%2Fwww.renren.com%2Fbind%2Fbaidu%2FbaiduLoginCallBack" id="login_baidu" stats="loginPage_baidu_link"></a>
</div>
</div>
</div>
<div class="main-column">
<div id="mainRecommend" class="main-recommend">
<div id="ad100000000061" data-pv="h01" class="wwwad"></div>
<!--<script>
load_jebe_ads(1)
</script>-->
<div class="login-recommend clearfix">
<div class="intro">
<div class="item">
<a class="qrcode content" href="#nogo" target="_blank"></a>
</div>
<div class="item">
<a class="phone content" href="http://zhibo.renren.com/client" target="_blank"></a>
</div>
<div class="item">
<!--<a class="pad content" href="http://2014.renren.com/ipad" target="_blank"></a> -->
<a class="pad content" href="http://down.renren.com/pczbzs/rrzb_Setup-13.exe" target="_blank"> </a>
</div>
<div class="item">
<a class="other content" href="http://2014.renren.com/" target="_blank"></a>
</div>
<div class="item">
<a class="music content" href="http://musics.renren.com/" target="_blank"> </a>
</div>
<div class="item">
<a class="game content" href="http://renren-game.renren.com" target="_blank"> </a>
</div>
</div>
</div>
</div>
</div>
</div></div>
</div>
</div>
<div class="ft-wrapper clearfix">
<p>
<strong>玩转人人</strong>
<a href="http://page.renren.com/register/regGuide/" target="_blank">公共主页</a>
<a href="http://zhibo.renren.com" target="_blank">美女直播</a>
<a href="http://support.renren.com/helpcenter" target="_blank">客服帮助</a>
<a href="http://www.renren.com/siteinfo/privacy" target="_blank">隐私</a>
</p>
<p>
<strong>商务合作</strong>
<a href="http://page.renren.com/marketing/index" target="_blank">品牌营销</a>
<a href="http://bolt.jebe.renren.com/introduce.htm" class="l-2" target="_blank">中小企业<br />自助广告</a>
<a href="http://dev.renren.com/" target="_blank">开放平台</a>
<a href="http://dsp.renren.com/dsp/index.htm" target="_blank">人人DSP</a>
</p>
<p>
<strong>公司信息</strong>
<!--<a href="http://www.renren-inc.com/zh/product/renren.html" target="_blank">关于我们</a>-->
<a href="http://www.donews.com/commom/aboutus" target="_blank">关于我们</a>
<a href="http://page.renren.com/gongyi" target="_blank">人人公益</a>
<a href="http://www.renren-inc.com/zh/hr/" target="_blank">招聘</a>
<a href="#nogo" id="lawInfo">法律声明</a>
<!--<a href="http://s.xnimg.cn/a92221/wap/mobile/Reporting/index.html" target="_blank">举报流程</a>-->
<a href="http://s.xnimg.cn/a95941/wap/mobile/Reporting/index.html" target="_blank">举报流程</a>
</p>
<p>
<strong>友情链接</strong>
<a href="http://fenqi.renren.com/" target="_blank">人人分期</a>
<a href="https://licai.renren.com/" target="_blank">人人理财</a>
<a href="http://www.woxiu.com/" target="_blank">我秀</a>
<a href="http://zhibo.renren.com/" target="_blank">人人直播</a>
<a href="http://www.renren.com/" target="_blank">人人网</a>
<a href="https://www.kaixin.com" target="_blank">开心汽车</a>
</p>
<p>
<strong>人人移动客户端下载</strong>
<a href="http://mobile.renren.com/showClient?v=platform_rr&psf=42064" target="_blank">iPhone/Android</a>
<a href="http://mobile.renren.com/showClient?v=platform_hd&psf=42067" target="_blank">iPad客户端</a>
<a href="http://mobile.renren.com" target="_blank">其他人人产品</a>
</p>
<!--<p class="copyright-info">-->
<!-- 临时添加公司信息用 -->
<p class="copyright-info" style="margi-left: -20px">
<span>公司全称:北京斗牛士文化传媒有限公司</span>
<span>公司电话:010-60845018</span>
<span><a href="mailto:kefu@renren.com">公司邮箱:kefu@renren.com</a></span>
<span>公司地址:北京市海淀区宝盛东路多牛传媒中心</span>
<span>违法和不良信息举报电话:024-31160919</span>
<span><a href="http://jb.ccm.gov.cn/" target="_blank">12318全国文化市场举报网站</a></span>
<span><a target="_blank" href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11010802029038">京公网安备 11010802029038号</a></span>
<span><a href="http://report.12377.cn:13225/toreportinputNormal_anis.do" target="_blank"><img style="height: 22px;float: left; margin-left: 78px;" src="http://s.xnimg.cn/imgpro/civilization/jubaologoNew.png">网上有害信息举报中心</a></span>
<span><img id="wenhuajingying_icon" style="height: 28px;float: left; margin-left: 60px;" src="http://s.xnimg.cn/imgpro/civilization/wenhuajingying.png"/><a href="http://s.xnimg.cn/imgpro/xkz.png" target="_blank">京网文[2018]2361-237号</a></span>
<span><a href="http://s.xnimg.cn/imgpro/icp.png" target="_blank">京ICP证1510088号</a></span>
<span>人人网©2016</span>
<span><img src="http://a.xnimg.cn/imgpro/black-logo.png" style="vertical-align: text-top;"></span>
</p>
</div>
<!-- dj埋码 -->
<script type="text/javascript">
function sendStats(url){var n="log_"+(new Date).getTime();var c=window[n]=new Image;c.onload=c.onerror=function(){window[n]=null};c.src=url;c=null}
function goPAGE() {
var currentUrl = window.location.href.split('#')[0];
if ((navigator.userAgent.match(/(phone|pad|pod|iPhone|iPod|ios|Android|Mobile|BlackBerry|IEMobile|MQQBrowser|JUC|Fennec|wOSBrowser|BrowserNG|WebOS|Symbian|Windows Phone)/i))) {
return "wap";
}
else {
return "pc";
}
}
var judge = goPAGE();
(function(){
sendStats('http://dj.renren.com/seostat?j={"from":"login_'+window.location.hostname+'","dev":"'+judge+'","page":"'+window.location.href+'"}');
console.log('dj!!');
})();
</script>
<!--<script src="http://s.xnimg.cn/a95444/nx/apps/login/login.js" type="text/javascript" ></script> -->
<script src="login.js" type="text/javascript" ></script>
<script type="text/javascript" src="http://s.xnimg.cn/a89789/js/adstats.js"></script>
<script type="text/javascript">
<!-- var oad1 = document.querySelector('.wwwad');
scrollReq(oad1); -->
</script>
<!--
<script>
define(['jquery'], function ($) {
var additionals = [
{ type: 'music', value: '音乐', image: 'http://a.xnimg.cn/nx/apps/login/cssimg/music.jpg', url: 'http://musics.renren.com/' },
{ type: 'game', value: '游戏', image: 'http://a.xnimg.cn/nx/apps/login/cssimg/game.jpg', url: 'http://renren-game.renren.com' }
]
var $intro = $('<div class="intro"></div>')
$intro
.on('mouseenter', '.item', function (e) {
$(this).addClass('active').removeClass('unactive')
})
.on('mouseleave', '.item', function (e) {
$(this).removeClass('active').addClass('unactive')
})
$.each(additionals, function (index, item) {
var $item = $('<div class="item unactive"></div>').css({ position: 'relative' })
$('<a class="' + item.type + ' content" href="' + item.url + '"></ a>')
.attr({ target: '_blank', hidefocus: true })
.css({ background: 'url(' + item.image + ') 0 0 no-repeat' })
.appendTo($item)
$intro.append($item)
})
$('.login-recommend').append($intro)
})
</script>
-->
</body>
<script src="music_ext.js" type="text/javascript" ></script>
</html>
"""
from bs4 import BeautifulSoup
# Build the parse tree from the saved page using the lxml backend.
soup = BeautifulSoup(html, 'lxml')

# Earlier exercises, kept commented out for reference.
#
# 1. Fetch every matching tag (demonstrated with <div>):
# trs = soup.find_all('div')
# for tr in trs:
#     print('tr', tr)
#
# 2. limit=2 caps the result at two tags; index [0] picks the first of them:
# tr = soup.find_all('div', limit=2)[0]
# print(tr)
#
# 3. Every <tr> whose class is "even" (class is a reserved word in Python,
#    so the keyword argument is spelled class_):
# trs = soup.find_all('tr', class_='even')
# for tr in trs:
#     print(tr)
#
# 4. Every <a> whose id is "test" and whose class is "test":
# aList = soup.find_all('a', id='test', class_='test')
# aList = soup.find_all('a', attrs={"id": "test", "class": "test"})
# for a in aList:
#     print(a)
#
# 5. The href attribute of every <a>:
# aList = soup.find_all('a')
# for a in aList:
#     # a) subscript access
#     # href = a['href']
#     # b) via the attrs mapping
#     href = a.attrs['href']
#     print(href)

# 6. Print the plain text of every <div> except the first one found.
#    stripped_strings is used instead of strings because the latter also
#    yields whitespace-only fragments.
for div in soup.find_all('div')[1:]:
    for fragment in div.stripped_strings:
        print(fragment)
'''
find :找出第一个标签
find_all:找出所有的标签
'''
'''
1、string:获取某个标签下的非标签字符串,返回来的是个字符串
2、strings:获取某个标签下的子孙非标签字符串,返回来的是个生成器
3、stripped_strings:获取某个标签下的子孙非标签字符串,会去掉空白字符串,返回来的是个生成器
4、get_text:获取某个标签下的子孙非标签字符串,不是以列表的形式返回,是以普通字符串返回
'''
爬虫2-lxml解析
lxml解析
from lxml import etree
# Markup that HTMl_file() hands to etree.HTML(); left as a placeholder
# containing only a newline.
text = '''
'''
def HTMl_file():
    """Parse the module-level ``text`` string as HTML and print the tree.

    The string is parsed with ``etree.HTML`` and the resulting element is
    serialized back to UTF-8 and written to stdout.
    """
    htmlElement = etree.HTML(text)
    # tostring(..., encoding='utf-8') yields bytes; bytes objects expose
    # .decode(), not .decoding() -- the original call raised AttributeError.
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))
def parse_lagou_file():
    """Parse the HTML file ``./donw/baidu.html`` and print the serialized tree.

    An explicit ``HTMLParser`` is passed to ``etree.parse`` so the file is
    read as HTML with UTF-8 encoding.
    """
    # Define the parser explicitly and hand it to etree.parse().
    parse = etree.HTMLParser(encoding='utf-8')
    # NOTE(review): "donw" may be a typo for "down" -- confirm against the
    # directory that actually exists on disk before changing the path.
    htmlElement = etree.parse('./donw/baidu.html', parser=parse)
    # tostring(..., encoding='utf-8') yields bytes; the method to turn them
    # into str is .decode(), not .decoding() (AttributeError in the original).
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))
# Run the file-parsing demo only when this script is executed directly.
if __name__ == "__main__":
    parse_lagou_file()
爬虫实战3-电影天堂
电影天堂
#Author lpf
#usr/bin/src
from lxml import etree
import requests
# Scheme and host that get_detail_urls() prepends to each href it extracts
# (presumably site-relative paths -- verify against the live page).
BASE_DOMAIN = 'http://dytt8.net'
# url = 'https://dytt8.net/html/gndy/dyzz/list_23_1.html'
# Headers passed to every requests.get() call in this script; the
# User-Agent is a desktop Chrome identification string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
def get_detail_urls(url):
    """Fetch one listing page and yield the URLs of the detail pages it links to.

    Args:
        url: URL of a listing page.

    Returns:
        A lazy, single-pass ``map`` iterator of absolute detail-page URLs,
        one per ``<a href>`` found inside ``<table class="tbspan">``.
    """
    # Local import so this block brings its own dependency into scope.
    from urllib.parse import urljoin

    response = requests.get(url, headers=headers)
    # requests may guess this page's encoding wrongly, which garbles the
    # Chinese text in response.text. Only href attribute values are read
    # here (presumably ASCII paths), so the guessed decoding is kept.
    # text = response.content.decode('gbk')
    text = response.text
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin gives the same result as BASE_DOMAIN + href for hrefs that
    # start with "/", and additionally handles absolute hrefs and hrefs
    # without a leading slash, which plain concatenation would mangle.
    # The lambda parameter no longer shadows this function's ``url``.
    return map(lambda href: urljoin(BASE_DOMAIN, href), hrefs)
# for detail_url in detail_urls:
# detail_url_data = BASE_DOMAIN + detail_url
# # print(BASE_DOMAIN+detail_url)
# response = requests.get(detail_url_data,headers=headers)
# text = response.content.decode('gbk')
# html = etree.HTML(text)
# # print(etree.tostring(html,encoding='utf-8').decode('utf-8'))
#
# break
def parse_detail_page(url):
    """Download one movie detail page and extract its metadata.

    Args:
        url: URL of a movie detail page.

    Returns:
        dict: Always contains ``title``. Contains ``cover`` and
        ``screenshot`` when the ``Zoom`` container holds a first and a
        second image, and ``year``, ``country`` and ``actors`` when the
        corresponding "◎" lines appear in the page text. The dict is also
        printed, as before.

    Raises:
        IndexError: If the title node or the ``Zoom`` container is missing.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    movies = {}
    response = requests.get(url, headers=headers)
    # Decode the raw bytes explicitly as GBK rather than trusting the
    # encoding requests would guess for response.text.
    text = response.content.decode('gbk')
    html = etree.HTML(text)

    title = html.xpath("//font[@color='#07519a']/text()")[0]
    movies['title'] = title

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img//@src")
    print('imgs-----', imgs)
    # First image is treated as the cover, second as the screenshot.
    # Pages with fewer images no longer abort with IndexError.
    if imgs:
        movies['cover'] = imgs[0]
    if len(imgs) > 1:
        movies['screenshot'] = imgs[1]

    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            # strip() removes the whitespace surrounding the value.
            movies['year'] = info.replace("◎年 代", "").strip()
        elif info.startswith("◎产 地"):
            # Stripped as well, for consistency with the other fields.
            movies['country'] = info.replace("◎产 地", "").strip()
        elif info.startswith("◎主 演"):
            # The first actor shares the line with the label; the rest
            # follow on separate text nodes until the next "◎" field.
            actors = [info.replace("◎主 演", "").strip()]
            for following in infos[index + 1:]:
                actor = following.strip()
                if actor.startswith("◎"):
                    break
                # The original had this append commented out, so only the
                # first actor was ever recorded. Blank nodes are skipped.
                if actor:
                    actors.append(actor)
            movies['actors'] = actors

    print(movies)
    # Return the result so callers (spider assigns it) receive the data
    # instead of None.
    return movies
def spider():
    """Visit listing pages 1 through 7 and parse every detail page they link to."""
    listing_template = 'https://dytt8.net/html/gndy/dyzz/list_23_{}.html'
    # Expand the template once per page number, then walk each listing.
    listing_urls = (listing_template.format(page_no) for page_no in range(1, 8))
    for listing_url in listing_urls:
        for detail_url in get_detail_urls(listing_url):
            parse_detail_page(detail_url)
# Start the crawl only when this script is executed directly.
if __name__ == "__main__":
    spider()
爬虫实战4-豆瓣电影
豆瓣电影
#Author lpf
#usr/bin/src
import requests
from lxml import etree
# Headers sent with the request: a desktop Chrome User-Agent and an
# empty Referer.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'Referer':''
}

# 1. Download the target page.
url = 'https://movie.douban.com/'
response = requests.get(url, headers=headers)
# print(response.text)
# response.text    -> the body decoded to str (unicode)
# response.content -> the raw, unprocessed body as bytes
text = response.text

# 2. Extract the data of interest from the downloaded page.
html = etree.HTML(text)
# [0] takes the first matching <ul>; an IndexError here means the page
# contains no <ul class="ui-slide-content">.
ul = html.xpath("//ul[@class='ui-slide-content']")[0]
# print(ul)
lis = ul.xpath("./li")

movies = []
for li in lis:
    print(etree.tostring(li, encoding='utf-8').decode('utf-8'))
    # xpath() returns a list for each attribute query, so every value
    # stored below is a list of matches rather than a bare string.
    title = li.xpath("@data-title")
    score = li.xpath("@data-rate")
    duration = li.xpath("@data-duration")
    region = li.xpath("@data-region")
    director = li.xpath("@data-director")
    thumbnail = li.xpath(".//img/@src")
    movie = {
        'title': title,
        'score': score,
        'duration': duration,
        'region': region,
        'director': director,
        # Key was misspelled 'thumbbail' in the original.
        'thumbnail': thumbnail
    }
    movies.append(movie)
print(movies)
码字不易,关注小生吧!!!