爬虫学习之路

爬虫2

爬虫1-bs解析

'''
find :找出第一个标签
find_all:找出所有的标签
'''
'''
1、string:获取某个标签下的非标签字符串,返回来的是个字符串
2、strings:获取某个标签下的子孙非标签字符串,返回来的是个生成器
3、stripped_strings:获取某个标签下的子孙非标签字符串,会去掉空白字符串,返回来的是个生成器
4、get_text:获取某个标签下的子孙非标签字符串,不是以列表的形式返回,是以普通字符串返回
'''

#Author lpf
#usr/bin/src
html = """
<!doctype html><html class="nx-main980" >
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE8" />
<meta name="Description" content="人人网 校内是一个真实的社交网络,联络你和你周围的朋友。 加入人人网校内你可以:联络朋友,了解他们的最新动态;和朋友分享相片、音乐和电影;找到老同学,结识新朋友;用照片和日志记录生活,展示自我。" />
<meta name="Keywords" content="Xiaonei,Renren,校内,大学,同学,同事,白领,个人主页,博客,相册,群组,社区,交友,聊天,音乐,视频,校园,人人,人人网" />
<meta property="qc:admins" content="232517306762562566375" />
<meta property="wb:webmaster" content="f2fdc876b8ba2a5d" />
<meta name="msApplication-ID" content="App" />
<meta name="msApplication-PackageFamilyName" content="57722RenRenpreview.RenrenHD_fknrsfzqca1jw" /><link rel="shortcut icon" type="image/x-icon" href="http://a.xnimg.cn/favicon-rr.ico?ver=3" />
<link rel="apple-touch-icon" href="http://a.xnimg.cn/wap/apple_icon_.png" />
<script type="text/javascript">
XN = {get_check:'',get_check_x:'bc40edb7',env:{domain:'renren.com',shortSiteName:'人人',siteName:'人人网'}};
try {
window.onerror=function(){
var a=arguments,e=encodeURIComponent,l=location,d=new Date();
if(a.length!=3||a[2]==0)return 1;
new Image().src=['http://s.renren.com/speedstats/jserror/stats.php?message='+e(a[0]),
'url='+e(a[1]),'lineNo='+a[2],'location='+e(l),'time='+d.toLocaleTimeString()].join('&');
return 1;
};
} catch (e){};
</script><meta charset="utf-8"/>
<link rel="shortcut icon" type="image/x-icon" href="http://a.xnimg.cn/favicon-rr.ico?ver=3" />
<link rel="apple-touch-icon" href="http://a.xnimg.cn/wap/apple_icon_.png" />
<link rel="stylesheet" type="text/css" href="http://s.xnimg.cn/a86614/nx/core/base.css">
<script type="text/javascript">
if(typeof nx === 'undefined'){
var nx = {};
}
nx.log = {
startTime : + new Date()
};
nx.user = {
id : "",
ruid:"",
tinyPic	: " ",
name : "",
privacy: "",
requestToken : '',
_rtk : 'bc40edb7'
};nx.user.isvip = false;nx.user.hidead = false;nx.webpager = nx.webpager || {};
nx.production = true;
</script>
<script type="text/javascript" src="http://s.xnimg.cn/a83151/nx/core/libs.js"></script>
<script type="text/javascript">
define.config({map:{
"backbone":"http://s.xnimg.cn/a75208/nx/core/backbone.js",
"ui/draggable":"http://s.xnimg.cn/a70750/nx/core/ui/draggable.js",
"ui/menu":"http://s.xnimg.cn/a70736/nx/core/ui/menu.js",
"ui/resizable":"http://s.xnimg.cn/a70738/nx/core/ui/resizable.js",
"ui/sortable":"http://s.xnimg.cn/a70749/nx/core/ui/sortable.js",
"ui/tabs":"http://s.xnimg.cn/a78333/nx/core/ui/tabs.js",
"ui/ceiling":"http://s.xnimg.cn/a76297/nx/core/ui/ceiling.js",
"ui/columns":"http://s.xnimg.cn/a68070/nx/core/ui/columns.js",
"ui/dialog":"http://s.xnimg.cn/a76395/nx/core/ui/dialog.js",
"ui/fileupload":"http://s.xnimg.cn/a81310/nx/core/ui/fileupload.js",
"ui/pagination":"http://s.xnimg.cn/a70307/nx/core/ui/pagination.js",
"ui/placeholder":"http://s.xnimg.cn/a79685/nx/core/ui/placeholder.js",
"ui/progressbar":"http://s.xnimg.cn/a62964/nx/core/ui/progressbar.js",
"ui/rows":"http://s.xnimg.cn/a62964/nx/core/ui/rows.js",
"ui/scroll":"http://s.xnimg.cn/a61518/nx/core/ui/scroll.js",
"ui/scrollbar":"http://s.xnimg.cn/a76868/nx/core/ui/scrollbar.js",
"ui/select":"http://s.xnimg.cn/a82096/nx/core/ui/select.js",
"ui/slideshow":"http://s.xnimg.cn/a72804/nx/core/ui/slideshow.js",
"ui/speech":"http://s.xnimg.cn/a77631/nx/core/ui/speech.js",
"ui/textbox":"http://s.xnimg.cn/a79526/nx/core/ui/textbox.js",
"ui/renren/textbox":"http://s.xnimg.cn/a92727/nx/core/ui/renren/textbox.js",
"ui/tooltip":"http://s.xnimg.cn/a73228/nx/core/ui/tooltip.js",
"ui/renren/addfriend":"http://s.xnimg.cn/a78457/nx/core/ui/renren/addFriendLayer.js",
"ui/renren/at":"http://s.xnimg.cn/a78409/nx/core/ui/renren/atAndEmotion.js",
"ui/renren/emotion":"http://s.xnimg.cn/a78409/nx/core/ui/renren/atAndEmotion.js",
"ui/renren/commentCenter":"http://s.xnimg.cn/a83569/nx/core/ui/renren/commentCenter.js",
"ui/renren/friendgroup":"http://s.xnimg.cn/a62964/nx/core/ui/renren/friendGroup.js",
"ui/renren/friendListSelector":"http://s.xnimg.cn/a78513/nx/core/ui/renren/friendListSelector.js",
"ui/renren/like":"http://s.xnimg.cn/a83569/nx/core/ui/renren/like.js",
"nx/namecard":"http://s.xnimg.cn/a77668/nx/core/ui/renren/namecard.js",
"ui/renren/pagelayer":"http://s.xnimg.cn/a62964/nx/core/ui/renren/pageLayer.js",
"ui/renren/photoupload":"http://s.xnimg.cn/a82833/nx/core/ui/renren/photoupload.js",
"ui/renren/privacy":"http://s.xnimg.cn/a76680/nx/core/ui/renren/privacy.js",
"ui/renren/share":"http://s.xnimg.cn/a78999/nx/core/ui/renren/share.js",
"ui/renren/vocal":"http://s.xnimg.cn/a77347/nx/core/ui/renren/vocal.js",
"ui/renren/mvideo":"http://s.xnimg.cn/a80641/nx/core/ui/renren/mvideo.js",
"ui/renren/with":"http://s.xnimg.cn/a82994/nx/core/ui/renren/with.js",
"ui/clipboard":"http://s.xnimg.cn/a63417/nx/core/ui/clipboard.js",
"app/publisher":"http://s.xnimg.cn/a91505/nx/core/app/publisher.js",
"viewer":"http://s.xnimg.cn/a83025/nx/photo/viewer/js/viewer.js",
"media/player": "http://s.xnimg.cn/nx/photo/viewer/js/mediaplayer.js",
"ui/renren/like/commentseed":"http://s.xnimg.cn/a64636/nx/core/ui/renren/like.seed.comment.js",
"ui/renren/like/seed":"http://s.xnimg.cn/a62964/nx/core/ui/renren/like.seed.js",
"ui/renren/share/seed":"http://s.xnimg.cn/a62964/nx/core/ui/renren/share.seed.js",
"ui/renren/follow":"http://s.xnimg.cn/a78456/nx/core/ui/renren/follow.js",
"ui/renren/relationFollow":"http://s.xnimg.cn/a78457/nx/core/ui/renren/relationFollow.js",
"ui/autocomplete":"http://s.xnimg.cn/a70736/nx/core/ui/autocomplete.js",
"ui/showCommonFriend":"http://s.xnimg.cn/a78917/nx/core/ui/renren/showcommonfriend.js",
"photo/circler":"http://s.xnimg.cn/a73344/nx/photo/phototerminal/js/circler.js",
"ui/friendSearch":"http://s.xnimg.cn/a64338/nx/core/ui/renren/friendSearch.js",
"ui/renren/replyOption":"http://s.xnimg.cn/a68256/nx/core/ui/renren/replyOption.js",
"photo/avatarUpload": "http://s.xnimg.cn/a77340/nx/photo/upload-avata/js/avatarUpload.js",
"ui/renren/school":"http://s.xnimg.cn/a85689/nx/core/ui/renren/school.js"
}});
nx.data.isDoubleFeed = Boolean();
nx.data.isDoubleFeedGuide = Boolean();
</script>
<script type="text/javascript" src="http://s.xnimg.cn/a95943/nx/core/base.js"></script>
<!--[if lt IE 9]>
<script type="text/javascript">
document.execCommand("BackgroundImageCache", false, true);
</script>
<![endif]-->
<script type="text/javascript">nx.webpager.disable = true;</script>
<link href="login.css" rel="stylesheet" type="text/css" media="all" />		

<title>人人网,中国领先的实名制SNS社交网络。加入人人网,找到老同学,结识新朋友。</title><script src="http://s.xnimg.cn/a72842/n/core/base-all2.js" type="text/javascript"></script>
</head>
<body id="syshome" class="login">
<div id="header">
<div id="navBar" class="site-nav rr">
<div class="navigation-wrapper">
<div class="navigation navigation-new clearfix">
<div id="logo2">
<h1>
<a href="http://www.renren.com" title="人人网 renren.com - 人人网校内是一个真实的社交网络,联系朋友,一起玩游戏">
<img alt="人人网 renren.com - 人人网校内是一个真实的社交网络,联系朋友,一起玩游戏" title="人人网 renren.com - 人人网校内是一个真实的社交网络,联系朋友,一起玩游戏" src="http://a.xnimg.cn/nx/apps/login/cssimg/logo-big.jpg" />
</a>
</h1>
</div>
<div class="nav-body clearfix">
<div class="nav-other">
<div class="menu">
<a href="http://st.renren.com" target="_blank" class="st-btn">学生团体申请入口</a>
</div>
<div class="menu">
<a id="reg_link" title="注册" stats="homenav_reg" href="http://wwv.renren.com/xn.do?ss=10131&rt=1&g=v6reg">注册</a>
</div>
<div class="menu">
<a title="给我们提建议" stats="homenav_suggest" href="http://support.renren.com/link/suggest">反馈意见</a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div id="opi" class="page-wrapper clearfix">
<div class="full-page-holder">
<div class="full-page"><div class="login-page clearfix login-wrap">
<div class="side-column login-box">
<div class="login-panel ">
<div class="radiusimg">
<div class="shadow"></div>
<div class="pic"><img src="" id="personhead"/></div>
<!-- <img src="http://a.xnimg.cn/nx/apps/login/cssimg/person.jpg"> -->
</div>
<span id="errorMessage" class="errors_div" style="display:none;"></span>
<div class="yellow-error" id="yellow_error">
<a class="close" href="javascript:closeError();"></a>
<p class="wrong">您的用户名和密码不匹配</p>
<p class="worp">为了账号安全,已向您的邮箱: <strong id="sendemail"></strong>发送了一封确认信,请通过邮件内链接登录。</p>
<p class="m-26"><a id="gotoEmail" href="#" target="_blank">打开邮箱查收确认信</a></p>
<p class="m-26"><a href="javascript:closeError();">重新输入</a></p>
</div>
<div class="yellow-error" id="account_stop">
<a class="close" href="javascript:closeStop();"></a>
<p class="wrong"></p>
<p class="center">您的账号已停止使用,如有疑问请联系<a style="background:none;padding-left:0;" target="_blank" href="http://help.renren.com/#http://help.renren.com/support/contomvice?pid=2&selection={couId:193,proId:342,cityId:1000375}">客服</a></p>
</div>
<div class="yellow-error" id="account_lock">
<a class="close" href="javascript:closeLock();"></a>
<p class="wrong">您的账号由于以下某种原因需要解锁才能登录</p>
<ol>
<li>删除过账号</li>
<li>长时间没有登录网站</li>
<li>安全原因</li>
</ol>
<p class="center"><a href="http://safe.renren.com/relive.do">立即解锁</a></p>
</div>
<form method="post" id="loginForm" class="login-form" action="http://www.renren.com/PLogin.do">
<dl class="top clearfix">
<dd>
<input type="text" name="email" class="input-text" id="email" tabindex="1" value="" />
</dd>
</dl>
<dl class="pwd clearfix">
<dd>
<input type="password" id="password" name="password" error="请输入密码" class="input-text" tabindex="2"/>
<label class="pwdtip" id="pwdTip" for="password">请输入密码</label>
<a class="forgetPwd" id="forgetPwd" href="http://safe.renren.com/findPass.do" stats="home_findpassword">忘记密码?</a>
</dd>
</dl>
<div class="caps-lock-tips" id="capsLockMessage" style="display:none"></div>
<dl class="savepassword clearfix">
<dt>
<label title="为了确保您的信息安全,请不要在网吧或者公共机房勾选此项!" for="autoLogin" class="labelCheckbox">
<input type="checkbox" name="autoLogin" id="autoLogin" value="true" tabindex="4" />下次自动登录
</label>
</dt>
<dd>
<span class="getpassword" id="getpassword"><a href="http://safe.renren.com/findPass.do" stats="home_findpassword">忘记密码?</a></span>
</dd>
</dl>
<dl id="code" class="code clearfix">
<dt><label for="code">验证码:</label></dt>
<dd>
<input id="icode" type="text" name="icode" class="input-text" tabindex="3" autocomplete="off" />
<label class="codetip" id="codeTip" for="icode">请输入验证码</label>
</dd>
</dl>
<dl id="codeimg" class="codeimg clearfix">
<dt></dt>
<dd><img id="verifyPic_login" src="http://icode.renren.com/getcode.do?t=web_login&rnd=Math.random()"/>
</dd>
<a class="changeone" href="javascript:refreshCode_login();" >换一个</a>
</dl>
<dl class="bottom">
<input type="hidden" name="origURL" value="http://www.renren.com/880151247/profile" />
<input type="hidden" name="domain" value="renren.com" />
<input type="hidden" name="key_id" value="1" />
<input type="hidden" name="captcha_type" id="captcha_type" value="web_login" />
<input type="submit" id="login" class="input-submit login-btn" stats="loginPage_login_button" value="登录" tabindex="5"/>
</dl>
</form>
<div class="regnow">
<input type="button" onclick="window.location='http://wwv.renren.com/xn.do?ss=10131&rt=1&g=v6reg'" id="regnow"  class="input-button login-btn regbutton" value="注册" tabindex="6" stats="loginPage_signUp_button" />
</div>
<p class="third-party-title"><span class="underscore left"></span><span class="text">第三方登录</span><span class="underscore right"></span></p>
<div class="login_corp" >
<div class="Third-partyi-login">
	<a title="微信" class="login-item weixin" href="http://www.renren.com/api/jump?src=wx" id="login_weixin" stats="loginPage_weixin_link"></a>
	 <a title="QQ" class="login-item qq" href="http://www.renren.com/api/jump?src=qq" id="login_qq" stats="loginPage_qq_link"></a>
	 <a title="微博" class="login-item weibo" href="http://www.renren.com/api/jump?src=wb" id="login_weibo" stats="loginPage_weibo_link"></a>
</div>
</div>
<div class="other-login clearfix">
<div class="login-word login-item">其它账号登录:</div>
<a  title="移动" class="login-item yidong" href="https://open.mmarket.com:443/omee-aus/services/oauth/authorize?responseType=code&scope=getUserInfo&clientId=300007884008&redirectUri=http%3A%2F%2Fwww.renren.com%2Fbind%2Fcnmobile%2FloginCallBack&clientState=9" id="login_cnmobile" stats="loginPage_baidu_link"></a>
<a title="天翼" class="login-item tianyi" id="login_tianyi" href="https://oauth.api.189.cn/emp/oauth2/authorize?app_id=296961050000000294&response_type=code&redirect_uri=http://www.renren.com/bind/ty/tyLoginCallBack" stats="loginPage_tianyi_link"></a>
<a title="360" class="login-item lo360" id="login_360" href="https://openapi.360.cn/oauth2/authorize?client_id=5ddda4458747126a583c5d58716bab4c&response_type=code&redirect_uri=http://www.renren.com/bind/tsz/tszLoginCallBack&scope=basic&display=default" stats="loginPage_360_link"></a>
 <a title="百度" class="login-item baidu" href="https://openapi.baidu.com/oauth/2.0/authorize?response_type=code&client_id=foRRWjPq8In3SIhmKQw1Pep3&redirect_uri=http%3A%2F%2Fwww.renren.com%2Fbind%2Fbaidu%2FbaiduLoginCallBack" id="login_baidu" stats="loginPage_baidu_link"></a>
</div>
</div>
</div>
<div class="main-column">
<div id="mainRecommend" class="main-recommend">
<div id="ad100000000061" data-pv="h01" class="wwwad"></div>
<!--<script>
load_jebe_ads(1)
</script>-->
<div class="login-recommend clearfix">
<div class="intro">
<div class="item">
<a class="qrcode content" href="#nogo" target="_blank"></a>
</div>
<div class="item">
<a class="phone content" href="http://zhibo.renren.com/client" target="_blank"></a>
</div>
<div class="item">
<!--<a class="pad content" href="http://2014.renren.com/ipad" target="_blank"></a> -->
<a class="pad content" href="http://down.renren.com/pczbzs/rrzb_Setup-13.exe" target="_blank"> </a>
</div>
<div class="item">
<a class="other content" href="http://2014.renren.com/" target="_blank"></a>
</div>
    <div class="item">
        <a class="music content" href="http://musics.renren.com/" target="_blank"> </a>
            </div>
                <div class="item">
                    <a class="game content" href="http://renren-game.renren.com" target="_blank"> </a>
                   </div>
</div>
</div>
</div>
</div>
</div></div>
</div>
</div> 





 

<div class="ft-wrapper clearfix">

	<p>

		<strong>玩转人人</strong>

		<a href="http://page.renren.com/register/regGuide/" target="_blank">公共主页</a>

		<a href="http://zhibo.renren.com" target="_blank">美女直播</a>

		<a href="http://support.renren.com/helpcenter" target="_blank">客服帮助</a>

		<a href="http://www.renren.com/siteinfo/privacy" target="_blank">隐私</a>

	</p>

	<p>

		<strong>商务合作</strong>

		<a href="http://page.renren.com/marketing/index" target="_blank">品牌营销</a>

		<a href="http://bolt.jebe.renren.com/introduce.htm" class="l-2" target="_blank">中小企业<br />自助广告</a>

		<a href="http://dev.renren.com/" target="_blank">开放平台</a>

        <a href="http://dsp.renren.com/dsp/index.htm" target="_blank">人人DSP</a>

    </p>

	<p>

		<strong>公司信息</strong>

		<!--<a href="http://www.renren-inc.com/zh/product/renren.html" target="_blank">关于我们</a>-->

		<a href="http://www.donews.com/commom/aboutus" target="_blank">关于我们</a>

		<a href="http://page.renren.com/gongyi" target="_blank">人人公益</a>

		<a href="http://www.renren-inc.com/zh/hr/" target="_blank">招聘</a>

		<a href="#nogo" id="lawInfo">法律声明</a>

		<!--<a href="http://s.xnimg.cn/a92221/wap/mobile/Reporting/index.html" target="_blank">举报流程</a>-->

		<a href="http://s.xnimg.cn/a95941/wap/mobile/Reporting/index.html" target="_blank">举报流程</a>

	</p>

	<p>

		<strong>友情链接</strong>

		<a href="http://fenqi.renren.com/" target="_blank">人人分期</a>

		<a href="https://licai.renren.com/" target="_blank">人人理财</a>

		<a href="http://www.woxiu.com/" target="_blank">我秀</a>

		<a href="http://zhibo.renren.com/" target="_blank">人人直播</a>	

		<a href="http://www.renren.com/" target="_blank">人人网</a>

		<a href="https://www.kaixin.com" target="_blank">开心汽车</a>	

	</p>

	<p>

		<strong>人人移动客户端下载</strong>

		<a href="http://mobile.renren.com/showClient?v=platform_rr&psf=42064" target="_blank">iPhone/Android</a>

		<a href="http://mobile.renren.com/showClient?v=platform_hd&psf=42067" target="_blank">iPad客户端</a>

		<a href="http://mobile.renren.com" target="_blank">其他人人产品</a>

	</p>

	<!--<p class="copyright-info">-->

	<!-- 临时添加公司信息用 -->

	<p class="copyright-info" style="margi-left: -20px">

		<span>公司全称:北京斗牛士文化传媒有限公司</span>

		<span>公司电话:010-60845018</span>

        <span><a href="mailto:kefu@renren.com">公司邮箱:kefu@renren.com</a></span>

		<span>公司地址:北京市海淀区宝盛东路多牛传媒中心</span>

		<span>违法和不良信息举报电话:024-31160919</span>

		<span><a href="http://jb.ccm.gov.cn/" target="_blank">12318全国文化市场举报网站</a></span>

		<span><a target="_blank" href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11010802029038">京公网安备 11010802029038号</a></span>

		<span><a href="http://report.12377.cn:13225/toreportinputNormal_anis.do" target="_blank"><img style="height: 22px;float: left; margin-left: 78px;" src="http://s.xnimg.cn/imgpro/civilization/jubaologoNew.png">网上有害信息举报中心</a></span>

		<span><img id="wenhuajingying_icon" style="height: 28px;float: left; margin-left: 60px;" src="http://s.xnimg.cn/imgpro/civilization/wenhuajingying.png"/><a href="http://s.xnimg.cn/imgpro/xkz.png" target="_blank">京网文[2018]2361-237号</a></span>

		<span><a href="http://s.xnimg.cn/imgpro/icp.png" target="_blank">京ICP证1510088号</a></span>

		<span>人人网&copy;2016</span>

		<span><img src="http://a.xnimg.cn/imgpro/black-logo.png" style="vertical-align: text-top;"></span>



	</p>

</div>

<!-- dj埋码 -->

<script type="text/javascript">

function sendStats(url){var n="log_"+(new Date).getTime();var c=window[n]=new Image;c.onload=c.onerror=function(){window[n]=null};c.src=url;c=null}

function goPAGE() {

var currentUrl = window.location.href.split('#')[0];

if ((navigator.userAgent.match(/(phone|pad|pod|iPhone|iPod|ios|Android|Mobile|BlackBerry|IEMobile|MQQBrowser|JUC|Fennec|wOSBrowser|BrowserNG|WebOS|Symbian|Windows Phone)/i))) {

    return "wap";

}

else {

     return "pc";

 }

}

var judge = goPAGE();

    (function(){

        sendStats('http://dj.renren.com/seostat?j={"from":"login_'+window.location.hostname+'","dev":"'+judge+'","page":"'+window.location.href+'"}');

        console.log('dj!!');

    })();

</script>


<!--<script src="http://s.xnimg.cn/a95444/nx/apps/login/login.js" type="text/javascript" ></script> -->
<script src="login.js" type="text/javascript" ></script>
<script type="text/javascript" src="http://s.xnimg.cn/a89789/js/adstats.js"></script>
    <script type="text/javascript">  
<!--        var oad1 = document.querySelector('.wwwad');
        scrollReq(oad1); -->

            
    </script>
    <!--
    <script>
    define(['jquery'], function ($) {
          var additionals = [ 
              { type: 'music', value: '音乐', image: 'http://a.xnimg.cn/nx/apps/login/cssimg/music.jpg', url: 'http://musics.renren.com/' },
                  { type: 'game', value: '游戏', image: 'http://a.xnimg.cn/nx/apps/login/cssimg/game.jpg', url: 'http://renren-game.renren.com' }
                     ]

                        var $intro = $('<div class="intro"></div>')
                           $intro
                               .on('mouseenter', '.item', function (e) {
                                           $(this).addClass('active').removeClass('unactive')
                                               })
                                   .on('mouseleave', '.item', function (e) {
                                               $(this).removeClass('active').addClass('unactive')
                                                   })
                                      
                                         $.each(additionals, function (index, item) {
                                                   var $item =  $('<div class="item unactive"></div>').css({ position: 'relative' })
                                                         $('<a class="' + item.type + ' content" href="' + item.url + '"></ a>')
                                                                 .attr({ target: '_blank', hidefocus: true  })
                                                                         .css({ background: 'url(' + item.image + ') 0 0 no-repeat' })
                                                                                 .appendTo($item)
                                                                                       $intro.append($item)
                                                                                          })

                                            $('.login-recommend').append($intro)
    })
    </script>
    -->
</body>
<script src="music_ext.js" type="text/javascript" ></script>
</html>
"""
from bs4 import BeautifulSoup
# Build the parse tree from the sample page, using lxml as the parser backend.
soup = BeautifulSoup(html, "lxml")

# The commented snippets below are alternative find_all() exercises kept for
# reference; some of them target tags (tr, id/class "test") from a different
# sample page than the one above.

# 1. Get every tag of one kind (here: all <div> tags).
# trs = soup.find_all('div')
# # print(trs)
# for tr in trs:
#     print('tr',tr)

# 2. Get the second tag: limit=2 stops after two matches, index 1 is the second.
# tr = soup.find_all('div',limit=2)[1]
# print(tr)

# 3. Get all <tr> tags whose class is "even".
#    `class` is a reserved word in Python, so the filter is spelled `class_`.
# trs = soup.find_all('tr',class_='even')
# for tr in trs:
#     print(tr)

# 4. Extract all <a> tags whose id is "test" and whose class is also "test".
# aList = soup.find_all('a',id='test',class_='test')
# aList = soup.find_all('a',attrs={"id":"test","class":"test"})
# for a in aList:
#     print(a)

# 5. Read the href attribute of every <a> tag.
# aList = soup.find_all('a')
# for a in aList:
#     # a) subscript access
#     # href = a['href']
#     # print(href)
#     # b) through the attrs mapping
#     href = a.attrs['href']
#     print(href)

# 6. Print all plain text.
# Every <div> except the first is visited; stripped_strings yields the
# descendant text with surrounding whitespace removed and blank entries
# skipped (the plain `strings` generator would also yield the blank ones).
divs = soup.find_all("div")[1:]
for div_tag in divs:
    # Other options tried here: div_tag.find_all("a")[0].string, div_tag.strings
    for piece in div_tag.stripped_strings:
        print(piece)
'''
find :找出第一个标签
find_all:找出所有的标签
'''
'''
1、string:获取某个标签下的非标签字符串,返回来的是个字符串
2、strings:获取某个标签下的子孙非标签字符串,返回来的是个生成器
3、stripped_strings:获取某个标签下的子孙非标签字符串,会去掉空白字符串,返回来的是个生成器
4、get_text:获取某个标签下的子孙非标签字符串,不是以列表的形式返回,是以普通字符串返回
'''

爬虫2-lxml解析

lxml解析

from lxml import etree
# Placeholder for the HTML markup that HTMl_file() parses; paste the page
# source between the quotes. As written it holds a single newline.
text = """
"""
def HTMl_file():
    """Parse the module-level ``text`` string as HTML and print the resulting tree.

    The tree is serialized with ``etree.tostring(..., encoding='utf-8')`` and
    the serialized output is decoded back to ``str`` before printing.
    """
    htmlElement = etree.HTML(text)
    # The serialized tree must be turned into str with ``decode``; the previous
    # ``.decoding(...)`` call is not a bytes method and raised AttributeError.
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))

def parse_lagou_file():
    """Parse the local file ``./donw/baidu.html`` as HTML and print the resulting tree.

    An explicit ``HTMLParser`` declared as UTF-8 is passed to ``etree.parse``
    instead of relying on the default parser.
    """
    parse = etree.HTMLParser(encoding='utf-8')  # HTML parser, input declared as UTF-8
    htmlElement = etree.parse('./donw/baidu.html', parser=parse)
    # The serialized tree must be turned into str with ``decode``; the previous
    # ``.decoding(...)`` call is not a bytes method and raised AttributeError.
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))
# Script entry point: only parse the saved page when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    parse_lagou_file()

爬虫实战3-电影天堂

电影天堂

#Author lpf
#usr/bin/src
from lxml import etree
import requests

# Prefix prepended to the hrefs found on listing pages.
BASE_DOMAIN = "http://dytt8.net"

# Example listing page:
# url = 'https://dytt8.net/html/gndy/dyzz/list_23_1.html'

# Request headers sent with every HTTP call; the User-Agent is that of a
# desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.142 Safari/537.36"
    ),
}
def get_detail_urls(url):
    """Collect the detail-page URLs linked from one listing page.

    Args:
        url: Address of a listing page.

    Returns:
        A list with one absolute URL per ``<a href>`` found inside the
        ``<table class="tbspan">`` elements, resolved against ``BASE_DOMAIN``.
    """
    from urllib.parse import urljoin

    response = requests.get(url, headers=headers)
    # Author's note: requests guesses this site's encoding wrong, so decoded
    # text can come out garbled (``response.content.decode('gbk')`` is the
    # alternative). Only href attribute values are read here — presumably
    # plain ASCII paths, so ``response.text`` is used as-is; TODO confirm.
    html = etree.HTML(response.text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin yields the same result as BASE_DOMAIN + href for site-relative
    # paths ("/html/..."), and additionally copes with hrefs that are already
    # absolute or that lack the leading slash.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]
def _parse_info_lines(infos):
    """Extract year, country and cast from the text nodes of the detail body.

    Args:
        infos: List of text strings; fields are recognised by their "◎" prefix.

    Returns:
        A dict holding whichever of ``year``, ``country`` and ``actors`` were found.
    """
    details = {}
    for index, raw in enumerate(infos):
        line = raw.strip()
        if line.startswith("◎年  代"):
            details['year'] = line.replace("◎年  代", "").strip()
        elif line.startswith("◎产  地"):
            details['country'] = line.replace("◎产  地", "").strip()
        elif line.startswith("◎主  演"):
            # The first actor shares the line with the field label; the rest
            # follow one per text node until the next "◎" field begins.
            actors = [line.replace("◎主  演", "").strip()]
            for follow in infos[index + 1:]:
                actor = follow.strip()
                if actor.startswith("◎"):
                    break
                if actor:
                    actors.append(actor)
            # Stored after the scan so the key exists even when the cast is a
            # single line immediately followed by the next field.
            details['actors'] = actors
    return details


def parse_detail_page(url):
    """Download one movie detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict with ``title``, ``cover`` and ``screenshot`` (the latter two are
        ``None`` when the page has too few images) plus any of ``year``,
        ``country`` and ``actors`` that were present. The dict is also printed.

    Raises:
        IndexError: If the title element or the ``Zoom`` container is missing.
    """
    movies = {}
    response = requests.get(url, headers=headers)
    # The body is decoded explicitly as GBK rather than via response.text.
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    movies['title'] = html.xpath("//font[@color='#07519a']/text()")[0]

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img//@src")
    print('imgs-----', imgs)
    # First image is taken as the cover, second as the screenshot.
    movies['cover'] = imgs[0] if imgs else None
    movies['screenshot'] = imgs[1] if len(imgs) > 1 else None

    movies.update(_parse_info_lines(zoomE.xpath(".//text()")))
    print(movies)
    return movies



def spider(first_page=1, last_page=7):
    """Walk the listing pages and parse every detail page they link to.

    Args:
        first_page: Number of the first listing page to fetch (default 1).
        last_page: Number of the last listing page to fetch, inclusive
            (default 7, i.e. the original fixed range of pages 1-7).
    """
    base_url = 'https://dytt8.net/html/gndy/dyzz/list_23_{}.html'
    for page in range(first_page, last_page + 1):
        for detail_url in get_detail_urls(base_url.format(page)):
            # parse_detail_page prints each parsed movie itself.
            parse_detail_page(detail_url)

# Script entry point: start the crawl only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    spider()

爬虫实战4-豆瓣电影

豆瓣电影

#Author lpf
#usr/bin/src
import requests
from lxml import etree
# Request headers: a desktop Chrome User-Agent and an empty Referer.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'Referer':''
}

# 1. Download the target page.
url = 'https://movie.douban.com/'
response = requests.get(url,headers=headers)
# response.text is the body already decoded to str; response.content is the
# raw, unprocessed bytes exactly as received.
text = response.text

# 2. Extract the wanted data from the downloaded page.
html = etree.HTML(text)
# xpath() returns a list of matches; fall back to "no movies" instead of
# raising IndexError when the slide list is not present in the page.
slide_lists = html.xpath("//ul[@class='ui-slide-content']")
ul = slide_lists[0] if slide_lists else None
lis = ul.xpath("./li") if ul is not None else []

movies = []
for li in lis:
    print(etree.tostring(li,encoding='utf-8').decode('utf-8'))
    # xpath() on an attribute yields a list, so take the first src (if any).
    thumbnails = li.xpath(".//img/@src")
    thumbnail = thumbnails[0] if thumbnails else None

    # Element.get() returns the attribute value itself (None when absent), so
    # each field is a scalar rather than a one-element list.
    movie = {
        'title': li.get('data-title'),
        'score': li.get('data-rate'),
        'duration': li.get('data-duration'),
        'region': li.get('data-region'),
        'director': li.get('data-director'),
        'thumbnail': thumbnail,
    }
    movies.append(movie)
print(movies)


码字不易,关注小生吧!!!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值