python获取企库信息

最新推荐文章于 2021-09-20 20:12:45 发布

一杯丨凉白开

最新推荐文章于 2021-09-20 20:12:45 发布

阅读量134

点赞数

文章标签： python

本文链接：https://blog.csdn.net/weixin_49215012/article/details/107483455

版权

Python 获取企库网（qeecoo.com）企业信息

from datetime import time
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver


# Region name to query. Starts empty; the script entry point replaces it
# with whatever the user types at the prompt.
city = ""

# HTTP request headers sent with every page fetch.
header = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "origin": "http://www.qeecoo.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.116 Safari/537.36"
    ),
}

# Walk every listing page of a region
def findone(num,city):
    """Process listing pages 1..num for *city* and return their URLs.

    Page 1 lives at the bare region URL; every later page appends
    ``_<page number>`` to it. Each page URL is handed to ``findproduct``
    and then recorded.

    :param num: number of listing pages to visit.
    :param city: region name inserted into the listing URL.
    :return: list of the page URLs that were visited, in order.
    """
    base_url = ''.join(('http://www.qeecoo.com/', city, '企业黄页-', city, '企业名录'))
    visited = []
    for page in range(1, num + 1):
        # The first page has no numeric suffix.
        page_url = base_url if page == 1 else base_url + '_' + str(page)
        findproduct(page_url)
        visited.append(page_url)
    return visited

# Look up the total number of listing pages for a region
def findallpage(city, timeout=10):
    """Return the number of listing pages the site reports for *city*.

    Fetches the first listing page, takes the first element matching the
    CSS class ``last`` and reads the page number from the text after the
    final underscore in its link target.

    :param city: region name inserted into the listing URL.
    :param timeout: seconds to wait for the HTTP response; without a
        timeout ``requests`` may block indefinitely.
    :return: the page count as an ``int``, or ``0`` when no usable
        "last page" link is found on the page.
    """
    url = 'http://www.qeecoo.com/'+city+'企业黄页-'+city+'企业名录'
    resp = requests.get(url, headers=header, timeout=timeout)
    # The raw bytes are parsed, so BeautifulSoup does its own encoding
    # detection; setting resp.encoding would only affect resp.text.
    soup = BeautifulSoup(resp.content, 'html.parser')
    last_matches = soup.select('.last')
    if not last_matches:
        # No "last page" control on the page: nothing to read a count from.
        return 0

    # find_all only searches descendants, so the matched element's markup is
    # re-parsed to let the element itself match when it is an <a> tag.
    last_soup = BeautifulSoup(str(last_matches[0]), "html.parser")
    total_pages = 0
    for anchor in last_soup.find_all('a', href=True):
        _, sep, tail = anchor['href'].rpartition('_')
        if not sep:
            continue  # link carries no "_<page>" suffix
        try:
            # The last parsable link wins, as in the original loop.
            total_pages = int(tail)
        except ValueError:
            continue
    return total_pages

# Handle a single company page
def productone(urlp):
    """Print the URL of one company page.

    :param urlp: URL of the company page.
    """
    # Author's note: left unimplemented because the page did not contain
    # the data they were after, so no scraping happens here.
    print(urlp)

# Collect the company links found on one listing page
def findproduct(citynumone, timeout=10):
    """Fetch one listing page and pass each company URL to ``productone``.

    :param citynumone: URL of the listing page to fetch.
    :param timeout: seconds to wait for the HTTP response; without a
        timeout ``requests`` may block indefinitely.
    """
    # The response must not be bound to the name ``productone``: assigning
    # that name here would make it local to this function and hide the
    # module-level productone() that is called below.
    resp = requests.get(citynumone, headers=header, timeout=timeout)
    # The raw bytes are parsed, so BeautifulSoup does its own encoding
    # detection; setting resp.encoding would only affect resp.text.
    soup = BeautifulSoup(resp.content, 'html.parser')
    listing = soup.select('.list')
    # Re-parse the matched markup so find_all sees every <a> in it,
    # including a matched element that is itself an <a>.
    listing_soup = BeautifulSoup(str(listing), "html.parser")
    # Per the original author's note, every second link is an address link,
    # so only the 1st, 3rd, 5th, ... anchors are company links.
    for anchor in listing_soup.find_all('a')[::2]:
        href = anchor.get('href')
        if href is None:
            continue  # anchor without a target: nothing to follow
        productone('http://www.qeecoo.com/' + href)

# Script entry point
if __name__ == '__main__':
    # Ask the user which region to query.
    city = input('请输入：' + city)
    # Total page count + region -> walk every listing page of that region.
    total_pages = findallpage(city)
    findone(total_pages, city)