Crawling Baidu Baike Entries
Write a crawler that fetches the Baidu Baike entry for "网络爬虫" (web crawler, link: http://baike.baidu.com/view/284853.htm) and prints every link whose href contains "view", in the format below:
锁定 --> http://baike.baidu.com/view/10812319.htm
网络爬虫 --> http://baike.baidu.com/view/284853.htm
蜘蛛 --> http://baike.baidu.com/subview/8483/5395928.htm
FOAF --> http://baike.baidu.com/view/271451.htm
import urllib.request
import re
from bs4 import BeautifulSoup

def main():
    url = "http://baike.baidu.com/view/284853.htm"
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")  # use Python's built-in parser
    for each in soup.find_all(href=re.compile("view")):
        # print(each.text, "-->", "http://baike.baidu.com" + each["href"])
        print(each.text, "-->", "".join(["http://baike.baidu.com", each["href"]]))
        # join() is used above instead of "+" concatenation because join()
        # is generally faster when assembling a string from many pieces

if __name__ == "__main__":
    main()
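If you want to check the join() claim yourself, here is a minimal timeit sketch; the piece count and contents are arbitrary illustrations, not part of the crawler:

import timeit

# Hypothetical micro-benchmark: "+" concatenation vs. "".join()
# when building one string from many small fragments.
pieces = ["x"] * 1000

def concat_plus():
    s = ""
    for p in pieces:
        s += p
    return s

def concat_join():
    return "".join(pieces)

print("+   :", timeit.timeit(concat_plus, number=1000))
print("join:", timeit.timeit(concat_join, number=1000))

For only two pieces, as in the print above, the difference is negligible; join() pays off mainly when many fragments are combined in a loop.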
Going deeper: let the user search for an arbitrary entry
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup

def main():
    word = input("请输入检索的关键词:")
    keyword = urllib.parse.urlencode({"word": word})
    url = "https://baike.baidu.com/search/word?%s" % keyword
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    for each in soup.find_all(href=re.compile("view")):
        print(each.text, "-->", "".join(["http://baike.baidu.com", each["href"]]))

if __name__ == "__main__":
    main()
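urlencode() matters here because the keyword is usually Chinese: it percent-encodes the UTF-8 bytes so the query string is safe to embed in the URL. A quick standalone illustration:

import urllib.parse

# Percent-encode a Chinese keyword into a word=... query string
print(urllib.parse.urlencode({"word": "网络爬虫"}))
# word=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB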
Going deeper: adding subtitles
The user enters a search keyword; the crawler then visits each linked entry, checks whether it has a subtitle, and if so prints the subtitle along with the entry name.
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup

def main():
    word = input("请输入检索的关键词:")
    keyword = urllib.parse.urlencode({"word": word})
    url = "https://baike.baidu.com/search/word?%s" % keyword
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    for each in soup.find_all(href=re.compile("view")):
        # print(each.text, "-->", "".join(["http://baike.baidu.com", each["href"]]))
        content = "".join([each.text])
        url2 = "".join(["http://baike.baidu.com", each["href"]])
        req2 = urllib.request.Request(url2)
        response2 = urllib.request.urlopen(req2)
        html2 = response2.read().decode("utf-8")
        soup2 = BeautifulSoup(html2, "html.parser")
        if soup2.h2:  # the entry has a subtitle
            content = "".join([content, soup2.h2.text])
        content = "".join([content, "-->", url2])
        print(content)

if __name__ == "__main__":
    main()
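Fetching every linked page one by one is slow, and any single request can fail (network errors, timeouts, pages that are not valid UTF-8). A hedged sketch of how one might harden the nested fetch; fetch_soup is my own illustrative helper, not part of the original program:

import time
import urllib.request
import urllib.error
from bs4 import BeautifulSoup

def fetch_soup(url, delay=0.5):
    # Hypothetical helper: fetch a URL politely, returning None on failure.
    time.sleep(delay)  # small delay so we do not hammer the server
    try:
        response = urllib.request.urlopen(url, timeout=10)
        html = response.read().decode("utf-8")
    except (urllib.error.URLError, UnicodeDecodeError):
        return None
    return BeautifulSoup(html, "html.parser")

The loop body would then call fetch_soup(url2) and simply skip the link when it returns None.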
Going deeper: print 10 links at a time
Going one step further, we print ten links first and then ask the user whether to keep going.
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup

def test_url(soup):
    result = soup.find(text=re.compile("百度百科尚未收录词条"))
    if result:
        # Baidu appends a stray “ character at the end, so strip the
        # last character: {百度百科尚未收录词条 “} -> {百度百科尚未收录词条 }
        print(result[0:-1])
        return False
    else:
        return True

def summary(soup):
    # Earlier version, kept for reference:
    # word = soup.h1.text
    # if soup.h2:        # if there is a subtitle, print it too
    #     word += soup.h2.text
    # print(word)        # print the title
    # if soup.find(class_="lemma-summary"):   # print the summary
    #     print(soup.find(class_="lemma-summary").text)
    title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
    title = title_node.get_text()
    if soup.h2:
        title += soup.h2.text
    # print the title
    print(title)
    # extract the summary based on the page's structure
    summary_node = soup.find("div", class_="lemma-summary")
    if summary_node is None:
        summary = "None summary"
    else:
        summary = summary_node.get_text()
    print(summary)

def get_urls(soup):
    for each in soup.find_all(href=re.compile("view")):
        content = "".join([each.text])
        url2 = "".join(["http://baike.baidu.com", each["href"]])
        req2 = urllib.request.Request(url2)
        response2 = urllib.request.urlopen(req2)
        html2 = response2.read().decode("utf-8")
        soup2 = BeautifulSoup(html2, "html.parser")
        if soup2.h2:
            content = "".join([content, soup2.h2.text])
        content = "".join([content, "-->", url2])
        yield content

def main():
    word = input("请输入检索的关键词:")
    keyword = urllib.parse.urlencode({"word": word})
    url = "https://baike.baidu.com/search/word?%s" % keyword
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    if test_url(soup):
        summary(soup)
        print("下边打印相关链接:")
        each = get_urls(soup)
        while True:
            try:
                for i in range(10):
                    print(next(each))
            except StopIteration:
                break
            command = input("请输入任意字符继续打印,q退出程序:")
            if command == "q":
                break
            else:
                continue

if __name__ == "__main__":
    main()
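Note that get_urls() is a generator: calling it does no crawling by itself; each next() call resumes the loop, fetches one more linked page, and yields one line, and StopIteration signals that the links are exhausted. A minimal standalone illustration of the same pattern:

def numbers():
    # A generator runs lazily: each next() resumes execution at the last
    # yield, just as get_urls() above fetches one link per next() call.
    for n in range(3):
        yield n

gen = numbers()
print(next(gen))  # 0
print(next(gen))  # 1
print(next(gen))  # 2
try:
    next(gen)            # the generator is exhausted...
except StopIteration:
    print("done")        # ...which is what ends the while loop in main()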
The output of the full program is: