python 必应搜索教程

zaiecunmin

已于 2023-01-14 23:15:39 修改

阅读量2.5k

点赞数 8

文章标签： python 爬虫数据挖掘

于 2022-07-16 13:16:45 首次发布

本文链接：https://blog.csdn.net/m0_70451198/article/details/125794155

版权

必应搜索：
使用库：requests;bs4
目标：

分析必应的url
获取信息，解析为BeautifulSoup对象
提取信息，输出信息
源代码放在最后

1.分析URL：

https://cn.bing.com/search?#搜索URL
q=000#搜索内容
form=ANNTH1#国内版ANNTH1、国际版BESBTB
......#其余参数为特殊的验证码，不影响结果

建立类 bi:

import requests
from bs4 import BeautifulSoup
class bi:
    def make_url(self,q):
        return 'https://cn.bing.com/search?q=' + q

2.获取信息：
右键检查，我们发现所有信息都位于第一个’ol’标签中。
必应HTTPS的架构

class bi:
    def find(self):
        u = self.make_url(q)#构造URL
        r = requests.get(u)#发送GET请求获取信息
        soup = BeautifulSoup(r.text, 'html5lib')
        #用BeautifulSoup,html5lib进行解析，html5lib需要下载(不需要导入)
        #pip install html5lib
        all_ = soup.find('ol')#第一个<ol>
        a=all_.find_all('li')#<ol>下所有<li>存储到列表
        return a

3.提取信息：
循环列表提取信息

li架构

介绍文本

标题文本

class bi:
    def get(self,q):
        a=self.find()
        things = []
        for tag in a:
            try:
                item1 = tag.find(class_='b_title')
                url = item1.find('h2').find('a')['href']
                #页面的网址
                icon = item1.find('a').find('img')['src']
                # 页面的图标
                title = item1.text#只需要其中的文本内容
                item2=tag.find(class_='b_caption')
                text=item2.find('p').text
                #介绍内容
                thing = {'title': title, 'url': url, 'text':text, icon: 'icon'}
                #把信息存入字典
                things.append(thing)
                #把字典加入列表
            except:
                pass
                #有的信息架构特殊，使程序不报错
        return things

4.old源代码：

import requests
from bs4 import BeautifulSoup
class bi:
    def make_url(self,q):
        return 'https://cn.bing.com/search?q=' + q
    def find(self):
        u = self.make_url(q)#构造URL
        r = requests.get(u)#发送GET请求获取信息
        soup = BeautifulSoup(r.text, 'html5lib')
        #用BeautifulSoup,html5lib进行解析，html5lib需要下载(不需要导入)
        #pip install html5lib
        all_ = soup.find('ol')#第一个<ol>
        a=all_.find_all('li')#<ol>下所有<li>存储到列表
        return a
    def get(self,q):
        a=self.find()
        things = []
        for tag in a:
            try:
                item1 = tag.find(class_='b_title')
                url = item1.find('h2').find('a')['href']
                #页面的网址
                icon = item1.find('a').find('img')['src']
                # 页面的图标
                title = item1.text#只需要其中的文本内容
                item2=tag.find(class_='b_caption')
                text=item2.find('p').text
                #介绍内容
                thing = {'title': title, 'url': url, 'text':text, icon: 'icon'}
                #把信息存入字典
                things.append(thing)
                #把字典加入列表
            except:
                pass
                #有的信息架构特殊，使程序不报错
        return things

if __name__=='__main__':
    n=bi()
    a=input('Bing\n必应搜索：')
    for i in n.get(a):
        print(i['title'])
        print(i['url'])
        print(i['text'])
        print('_'*100+'\n')

5.new源代码

import requests
from bs4 import BeautifulSoup
class bing:
    def __init__(self):
        '''
        bing请求类
        
        Dict:
        header
        
        Function:
        __init__(self)
        make_url(self,q,data='chinese',pagenum='1')
        jx(self,url)

        Returns
        -------
        None.

        '''
        self.header = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76'
            }


    def make_url(self,q,data='chinese',pagenum='1'):
        '''
        生成必应url
        
        Parameters
        ----------
        q : str
            关键字
            data : str, optional
            'chinese' or 'english'. The default is 'chinese'.
            pagenum : str, optional
            The default is '1'.
            
            Returns
            -------
            url.

        '''
        if data != 'chinese' and data != 'english':
            assert data != 'chinese' and data != 'english', '不支持中英文以外的文字'
    
        if data == 'chinese':
            urls = 'https://cn.bing.com/search?q='+q+'&form=ANSPH1'
        elif data == 'english':
            urls = 'https://cn.bing.com/search?q='+q+'&FORM=BESBTB&ensearch=1'
                
        if pagenum == '1':
            return urls
        r = requests.get(urls,headers = self.header)
        s = BeautifulSoup(r.text,'html5lib')
        page = s.find('li',class_ = 'b_pag')
        pages = page.find_all('li')
        pageurls = {}
        for i in pages:
            try:
                pageurls[i.text] = i.find('a')['href']
            except:
                pass
    
        try:
            return pageurls[pagenum]
        except:
            return urls
    
    

    def jx(self,url):
        '''
        请求并解析url

        Parameters
        ----------
        url : str
            usually from function make_url.

        Returns
        -------
        None.

        '''
        r = requests.get(url,headers = self.header)
        s = BeautifulSoup(r.text,'html5lib')
        lis = s.find_all('li',class_ = 'b_algo')
        rlist = []
        # data = {'title':'','js':'','url':'','icon_url':''}
        
        for i in lis:
            data = {}
            
            data['title'] = i.find('div',class_ = 'b_title').find('h2').text
            data['url'] = i.find('div',class_ = 'b_title').find('h2').find('a')['href']
            try:
                data['js'] = i.find('p').text # .find('div',class_ = 'b_caption')
            except:
                data['js'] = 'no js'
                
            try:
                data['icon_url'] = i.find('div',class_ = 'b_title').find('a').find('div',class_ = 'rms_iac')['data-src']
            except:
                data['icon_url'] = 'no icon'
            rlist.append(data)
        return rlist
    
if __name__ == '__main__':         
    k = bing()
    url = k.make_url('bs4')
    rlist = k.jx(url)
    for i in rlist:
        for j in i:
            print(i[j])
        print('\n')