Implementing an automatic Baidu keyword crawler in Python

# coding: utf-8


import requests
from lxml import etree
import re
from sqlalchemy import create_engine


engine = create_engine('sqlite:///xskh_pachong.sqlite', echo=True)  # local SQLite file that stores the scraped leads

from sqlalchemy import Table,Column,Integer,String,ForeignKey
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
DBSession = sessionmaker(bind=engine)
session = DBSession()
class xskh_pachong(Base):
    """ORM model for one scraped lead: company name, phone number, snippet and source URL."""
    __tablename__ = "xskh_pachong"
    id = Column(Integer, primary_key=True)
    aname = Column(String(128))      # company name
    telephone = Column(String(126))  # mobile phone number
    info = Column(String(400))       # search-result snippet (title + abstract)
    http = Column(String(128))       # URL of the search result

    @classmethod
    def save(cls, data):
        # Persist one record and return its primary key.
        session.add(data)
        session.commit()
        return data.id
    

Base.metadata.create_all(engine)  # create the table if it does not already exist
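
The table can be exercised directly through the same session, which is handy for checking that the schema was created and, later, for inspecting what the crawler has collected. A minimal sketch, not part of the crawler itself; the sample values are made up for illustration:

# Hypothetical smoke test: insert one dummy row and read a few rows back.
row = xskh_pachong(aname='示例公司', telephone='13800000000',
                   info='example snippet', http='https://example.com')
xskh_pachong.save(row)

for r in session.query(xskh_pachong).limit(5):
    print(r.id, r.aname, r.telephone, r.http)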


def get_aname_and_telephone(url, company_name, telephone):
    """Fetch the result's landing page and append any company names and mobile numbers found
    to the caller's lists."""
    if url == '':
        return
    try:
        response = requests.get(url)
    except requests.RequestException:
        return
    try:
        content = response.content.decode('gbk', 'strict')
    except UnicodeDecodeError:
        content = response.content.decode('utf-8', 'ignore')

    html = etree.HTML(content)

    # Mobile numbers: scan the text of all <span> elements in the page body.
    a = html.xpath('''//body//span//text()''')
    pattern_mob = re.compile(r'1[34578]\d{9}')
    telephone.extend(pattern_mob.findall("".join(a)))

    # Company names: Chinese strings ending in 司 (company) or 厂 (factory).
    b = html.xpath('''//body//text()''')
    pattern_company = re.compile(r'[\u4e00-\u9fa5]{5,23}[司厂]')
    company_name.extend(pattern_company.findall("".join(b)))
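
The two regular expressions above do the actual extraction: mainland mobile numbers (11 digits starting with 13/14/15/17/18) and Chinese strings ending in 司 or 厂. A quick offline check using the re module already imported above; the sample text is made up for illustration:

# Hypothetical sample text, only to show what the two patterns match.
sample = '佛山市某某装饰工程有限公司 联系人:张先生 13812345678'
print(re.findall(r'1[34578]\d{9}', sample))                # ['13812345678']
print(re.findall(r'[\u4e00-\u9fa5]{5,23}[司厂]', sample))  # ['佛山市某某装饰工程有限公司']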



def get_by_word(wd):
    
    headersParameters = {    # HTTP request headers, used to make the crawler look like a regular browser
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }

    # wd is the search keyword; pn is the paging offset in Baidu's result URL.
    # Example keyword: wd = '佛山 装饰 联系人'
    pn = 0
    baidu_url = 'https://www.baidu.com'
    houzui = '/s?wd={}&pn={}'
    url = baidu_url + houzui.format(wd, pn)

    response = requests.get(url, timeout=60, headers=headersParameters)
    count = 0  # number of rows saved for this keyword
    while True:  
        content = response.content.decode() 
        html = etree.HTML(content)
        #//div[@id="content_left"]/div[@id="1"]/h3//text()  
        #//div[@id="content_left"]/div[@id="1"]//div[@class="c-abstract"]//text()
        #//div[@id="content_left"]/div[@id="10"]//h3//text()
        #//div[@id="content_left"]/div/@id

        items = html.xpath('''//div[@id="content_left"]/div/@id''')
        for i in items:
            a = html.xpath('''//div[@id="content_left"]/div[@id="{}"]//h3//text()'''.format(i))
            b = html.xpath('''//div[@id="content_left"]/div[@id="{}"]//div[@class="c-abstract"]//text()'''.format(i))
            c = html.xpath('''//div[@id="content_left"]/div[@id="{}"]/h3/a/@href'''.format(i))

            aname = "".join(a)                  # result title
            content = aname + ' ' + "".join(b)  # title + abstract, stored in the info column
            http = "".join(c)                   # landing-page URL of the result

            # Extract mobile numbers and company names directly from the snippet.
            pattern_mob = re.compile(r'1[34578]\d{9}')
            telephone = pattern_mob.findall(content)
            pattern_company = re.compile(r'[\u4e00-\u9fa5]{5,23}[司厂]')
            company_name = pattern_company.findall(content)

            if "".join(company_name) != "":
                aname = company_name[0]
            else:
                # No company name in the snippet: try the landing page itself.
                try:
                    get_aname_and_telephone(http, company_name, telephone)
                    aname = company_name[0]
                except Exception:
                    aname = ''

            if "".join(telephone) == "":
                # No phone number in the snippet: try the landing page itself.
                print(http, company_name, telephone)
                try:
                    get_aname_and_telephone(http, company_name, telephone)
                except Exception:
                    telephone = []

            # Save one row per phone number found for this result.
            for number in telephone:
                dbobj = xskh_pachong()
                dbobj.aname = aname
                dbobj.telephone = number
                dbobj.info = content
                dbobj.http = http
                dbobj.save(dbobj)
                count = count + 1
    



        # Baidu's pager: keep following the "下一页" (next page) link until it disappears.
        ptext = html.xpath('''//*[@id="page"]/a[@class='n']//text()''')
        purl = html.xpath('''//*[@id="page"]/a[@class='n']//@href''')

        pgtext = "".join(ptext)
        if pgtext.find('下一页') < 0:
            break
        if pgtext.find('上一页下一页') > 0:
            # Both "上一页" (previous) and "下一页" (next) links exist; the next-page link is the second one.
            pgurl = purl[1]
        else:
            pgurl = purl[0]
        url = baidu_url + pgurl
        # Fetch the next results page and loop again.
        response = requests.get(url, timeout=60, headers=headersParameters)
    
    print(count)  # report how many rows were saved for this keyword
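
A single keyword can be crawled on its own before running the full batch below; the query string here mirrors the example commented out inside get_by_word:

get_by_word('佛山 装饰 联系人')  # crawls the Baidu results for this query and prints the row count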


# Keyword sources, one entry per line: 地点.txt (locations), 行业.txt (industries), 关键字.txt (keywords).
with open('地点.txt', mode='r', encoding='utf-8') as faddr:
    addr = faddr.readlines()

with open('行业.txt', mode='r', encoding='utf-8') as fhangye:
    hangye = fhangye.readlines()

with open('关键字.txt', mode='r', encoding='utf-8') as fkey:
    key = fkey.readlines()

# Crawl every "location industry keyword" combination.
for item_addr in addr:
    for item_hangye in hangye:
        for item_key in key:
            wd = item_addr.replace('\n', ' ') + item_hangye.replace('\n', ' ') + item_key.replace('\n', ' ')
            get_by_word(wd)
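
Each query is simply the three file lines joined with spaces. With hypothetical one-line input files the combination looks like this (the sample values are made up for illustration):

# 地点.txt -> '佛山', 行业.txt -> '装饰', 关键字.txt -> '联系人'
sample_addr, sample_hangye, sample_key = '佛山\n', '装饰\n', '联系人\n'
wd = sample_addr.replace('\n', ' ') + sample_hangye.replace('\n', ' ') + sample_key.replace('\n', ' ')
print(repr(wd))  # '佛山 装饰 联系人 '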
