# encoding:utf-8
import requests
from bs4 import BeautifulSoup
# Scrape the Alexa China site-rank page and write rank/link/name rows to ./spider.csv.
session = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
    "Accept": "*/*",
}
url = "http://www.alexa.cn/siterank/"
req = session.get(url, headers=headers)
req.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(req.text, "html.parser")

# Three parallel result lists scraped from the page:
#   div.rank-index  -> numeric rank text
#   span.domain-link -> first <a>'s href (the site URL)
#   div.infos        -> site name; text before the first '(' strips the trailing note
rank_tags = soup.find_all("div", {"class": "rank-index"})
link_tags = soup.find_all("span", {"class": "domain-link"})
name_tags = soup.find_all("div", {"class": "infos"})

ranks = [tag.string for tag in rank_tags]
links = [tag.a["href"] for tag in link_tags]
names = [tag.contents[0].split("(")[0] for tag in name_tags]

# Encode once at the file boundary (UTF-8) rather than per-item; the context
# manager guarantees the handle is closed even if a write fails.
with open("./spider.csv", "w", encoding="utf-8") as wf:
    wf.write("rank,link,name\n")
    for rank, link, name in zip(ranks, links, names):
        wf.write("%s,%s,%s\n" % (rank, link, name))
运行结果（生成的 spider.csv 内容如下）：
rank,link,name
1,http://www.baidu.com,百度
2,http://www.qq.com,腾讯网
3,http://www.taobao.com,淘宝
4,http://www.tmall.com,天猫
5,http://www.sohu.com,搜狐网
6,http://www.jd.com,京东商城;京东多媒体网
7,http://www.sina.com.cn,新浪网
8,http://www.weibo.com,微博平台
9,http://www.360.cn,360安全中心
10,http://www.alipay.com,新支付宝
11,http://www.csdn.net,CSDN软件开发网
12,http://www.hao123.com,网址之家
13,http://www.so.com,360搜索
14,http://www.tianya.cn,天涯社区
15,http://www.soso.com,soso网
16,http://www.bing.com,必应搜索
17,http://www.youth.cn,中国青年网
18,http://www.xinhuanet.com,新华网
19,http://www.gmw.cn,光明网
20,http://www.zhihu.com,知乎