Implementing an automatic Baidu keyword crawler in Python

# coding: utf-8


import requests
from lxml import etree
import re
from sqlalchemy import create_engine


engine = create_engine('sqlite:///xskh_pachong.sqlite', echo=True)  # local SQLite file that stores the scraped leads

from sqlalchemy import Table,Column,Integer,String,ForeignKey
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
DBSession = sessionmaker(bind=engine)
session = DBSession()
class xskh_pachong(Base):
    """ORM model for one scraped lead: company name, phone number, snippet and source URL."""
    __tablename__ = "xskh_pachong"
    id = Column(Integer, primary_key=True)
    aname = Column(String(128))      # company name
    telephone = Column(String(126))  # mobile phone number
    info = Column(String(400))       # search-result snippet (title + abstract)
    http = Column(String(128))       # URL of the search result

    @classmethod
    def save(cls, data):
        # Persist one record and return its primary key.
        session.add(data)
        session.commit()
        return data.id
    

Base.metadata.create_all(engine)  # create the table if it does not already exist
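
The table can be exercised directly through the same session, which is handy for checking that the schema was created and, later, for inspecting what the crawler has collected. A minimal sketch, not part of the crawler itself; the sample values are made up for illustration:

# Hypothetical smoke test: insert one dummy row and read a few rows back.
row = xskh_pachong(aname='示例公司', telephone='13800000000',
                   info='example snippet', http='https://example.com')
xskh_pachong.save(row)

for r in session.query(xskh_pachong).limit(5):
    print(r.id, r.aname, r.telephone, r.http)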


def get_aname_and_telephone(url, company_name, telephone):
    """Fetch the result's landing page and append any company names and mobile numbers found
    to the caller's lists."""
    if url == '':
        return
    try:
        response = requests.get(url)
    except requests.RequestException:
        return
    try:
        content = response.content.decode('gbk', 'strict')
    except UnicodeDecodeError:
        content = response.content.decode('utf-8', 'ignore')

    html = etree.HTML(content)

    # Mobile numbers: scan the text of all <span> elements in the page body.
    a = html.xpath('''//body//span//text()''')
    pattern_mob = re.compile(r'1[34578]\d{9}')
    telephone.extend(pattern_mob.findall("".join(a)))

    # Company names: Chinese strings ending in 司 (company) or 厂 (factory).
    b = html.xpath('''//body//text()''')
    pattern_company = re.compile(r'[\u4e00-\u9fa5]{5,23}[司厂]')
    company_name.extend(pattern_company.findall("".join(b)))
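
The two regular expressions above do the actual extraction: mainland mobile numbers (11 digits starting with 13/14/15/17/18) and Chinese strings ending in 司 or 厂. A quick offline check using the re module already imported above; the sample text is made up for illustration:

# Hypothetical sample text, only to show what the two patterns match.
sample = '佛山市某某装饰工程有限公司 联系人:张先生 13812345678'
print(re.findall(r'1[34578]\d{9}', sample))                # ['13812345678']
print(re.findall(r'[\u4e00-\u9fa5]{5,23}[司厂]', sample))  # ['佛山市某某装饰工程有限公司']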



def get_by_word(wd):
    
    headersParameters = {    # HTTP request headers, used to make the crawler look like a regular browser
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }

    # wd is the search keyword; pn is the paging offset in Baidu's result URL.
    # Example keyword: wd = '佛山 装饰 联系人'
    pn = 0
    baidu_url = 'https://www.baidu.com'
    houzui = '/s?wd={}&pn={}'
    url = baidu_url + houzui.format(wd, pn)

    response = requests.get(url, timeout=60, headers=headersParameters)
    count = 0  # number of rows saved for this keyword
    while True:  
        content = response.content.decode() 
        html = etree.HTML(content)
        #//div[@id="content_left"]/div[@id="1"]/h3//text()  
        #//div[@id="content_left"]/div[@id="1"]//div[@class="c-abstract"]//text()
        #//div[@id="content_left"]/div[@id="10"]//h3//text()
        #//div[@id="content_left"]/div/@id

        items = html.xpath('''//div[@id="content_left"]/div/@id''')
        for i in items:
            a = html.xpath('''//div[@id="content_left"]/div[@id="{}"]//h3//text()'''.format(i))
            b = html.xpath('''//div[@id="content_left"]/div[@id="{}"]//div[@class="c-abstract"]//text()'''.format(i))
            c = html.xpath('''//div[@id="content_left"]/div[@id="{}"]/h3/a/@href'''.format(i))

            aname = "".join(a)                  # result title
            content = aname + ' ' + "".join(b)  # title + abstract, stored in the info column
            http = "".join(c)                   # landing-page URL of the result

            # Extract mobile numbers and company names directly from the snippet.
            pattern_mob = re.compile(r'1[34578]\d{9}')
            telephone = pattern_mob.findall(content)
            pattern_company = re.compile(r'[\u4e00-\u9fa5]{5,23}[司厂]')
            company_name = pattern_company.findall(content)

            if "".join(company_name) != "":
                aname = company_name[0]
            else:
                # No company name in the snippet: try the landing page itself.
                try:
                    get_aname_and_telephone(http, company_name, telephone)
                    aname = company_name[0]
                except Exception:
                    aname = ''

            if "".join(telephone) == "":
                # No phone number in the snippet: try the landing page itself.
                print(http, company_name, telephone)
                try:
                    get_aname_and_telephone(http, company_name, telephone)
                except Exception:
                    telephone = []

            # Save one row per phone number found for this result.
            for number in telephone:
                dbobj = xskh_pachong()
                dbobj.aname = aname
                dbobj.telephone = number
                dbobj.info = content
                dbobj.http = http
                dbobj.save(dbobj)
                count = count + 1
    



        # Baidu's pager: keep following the "下一页" (next page) link until it disappears.
        ptext = html.xpath('''//*[@id="page"]/a[@class='n']//text()''')
        purl = html.xpath('''//*[@id="page"]/a[@class='n']//@href''')

        pgtext = "".join(ptext)
        if pgtext.find('下一页') < 0:
            break
        if pgtext.find('上一页下一页') > 0:
            # Both "上一页" (previous) and "下一页" (next) links exist; the next-page link is the second one.
            pgurl = purl[1]
        else:
            pgurl = purl[0]
        url = baidu_url + pgurl
        # Fetch the next results page and loop again.
        response = requests.get(url, timeout=60, headers=headersParameters)
    
    print(count)  # report how many rows were saved for this keyword
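
A single keyword can be crawled on its own before running the full batch below; the query string here mirrors the example commented out inside get_by_word:

get_by_word('佛山 装饰 联系人')  # crawls the Baidu results for this query and prints the row count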


# Keyword sources, one entry per line: 地点.txt (locations), 行业.txt (industries), 关键字.txt (keywords).
with open('地点.txt', mode='r', encoding='utf-8') as faddr:
    addr = faddr.readlines()

with open('行业.txt', mode='r', encoding='utf-8') as fhangye:
    hangye = fhangye.readlines()

with open('关键字.txt', mode='r', encoding='utf-8') as fkey:
    key = fkey.readlines()

# Crawl every "location industry keyword" combination.
for item_addr in addr:
    for item_hangye in hangye:
        for item_key in key:
            wd = item_addr.replace('\n', ' ') + item_hangye.replace('\n', ' ') + item_key.replace('\n', ' ')
            get_by_word(wd)
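
Each query is simply the three file lines joined with spaces. With hypothetical one-line input files the combination looks like this (the sample values are made up for illustration):

# 地点.txt -> '佛山', 行业.txt -> '装饰', 关键字.txt -> '联系人'
sample_addr, sample_hangye, sample_key = '佛山\n', '装饰\n', '联系人\n'
wd = sample_addr.replace('\n', ' ') + sample_hangye.replace('\n', ' ') + sample_key.replace('\n', ' ')
print(repr(wd))  # '佛山 装饰 联系人 '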
