Python爬虫百度搜索标题

最新推荐文章于 2024-07-29 21:31:04 发布

疯狂的编程者

最新推荐文章于 2024-07-29 21:31:04 发布

阅读量5k

点赞数 3

分类专栏： python爬虫

本文链接：https://blog.csdn.net/wuzhilong277832060/article/details/80310450

版权

python爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

from urllib import request,parse
import urllib
import http.cookiejar
from bs4 import BeautifulSoup

word=input("请输入搜索的关键词:");
url="http://www.baidu.com/s?wd="+urllib.parse.quote(word);

headers={"Accept": "text/html, application/xhtml+xml, image/jxr, */*",

         "Accept - Encoding": "gzip, deflate, br",

         "Accept - Language": "zh - CN",

         "Connection": "Keep - Alive",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299",
         "referer":"baidu.com"};
cjar = http.cookiejar.CookieJar();
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar));

headall=[];
for key,value in headers.items():
    item=(key,value);
    headall.append(item);
opener.addheaders=headall;
urllib.request.install_opener(opener);
data =urllib.request.urlopen(url).read().decode('utf-8');
soup=BeautifulSoup(data,'html.parser');
# 以格式化的形式打印html
#print(soup.prettify());
for result_table in soup.find_all('h3',class_='t'):

    a_click = result_table.find("a");

    print( "-----标题----\n" + a_click.get_text())  # 标题
    print("----链接----\n" + str(a_click.get("href")))  # 链接