Python qichacha listed-company patent crawler

from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
from selenium import webdriver
import csv
import re
import numpy as np
import os

afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
# driver = webdriver.PhantomJS(executable_path=r'D:\code\patent_info\phantomjs-2.1.1-windows\bin\phantomjs.exe', service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
chrome_driver = r'D:\code\patent_info\chromedriver.exe'
chrome_options = webdriver.ChromeOptions()
# reuse a persistent Chrome profile so the logged-in qichacha session survives between runs
chrome_options.add_argument(r'--user-data-dir=D:\code\patent_info\ChromeUserqichacha0623')
# chrome_options.add_argument('--headless')  # headless mode; must be set on chrome_options before the driver is created
driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)
# driver.maximize_window()

def get_company_message(company):
    '''
    Input: company is a Chinese company name
    Todo: search qcc.com for the company and derive its patent page URL
    Return: patent page URL
    '''
    driver.get('https://www.qcc.com/search?key={}'.format(company))
    time.sleep(5)
    print('https://www.qcc.com/search?key={}'.format(company))
    html_page = driver.page_source
    soup = BeautifulSoup(html_page, features="lxml")
    # the first search result links to the company profile page
    href = soup.find_all('a', {'class': 'title'})[0].get('href')
    print('href is {}'.format(href))
    # the patent list lives under /cassets instead of /firm
    href2 = href.replace('firm', 'cassets')
    print('href2 is {}'.format(href2))
    return href2
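
# Illustration of the firm -> cassets substitution above (the company id here
# is hypothetical, only the URL shape matters):
#   'https://www.qcc.com/firm/abcdef123456.html'
#   -> 'https://www.qcc.com/cassets/abcdef123456.html'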

def is_element_exist(element):
    '''
    Input: xpath of the patent_table element
    Todo: check whether the patent section exists on the current page
    Return: bool
    '''
    try:
        driver.find_element_by_xpath(element)
        return True
    except Exception:
        return False
    # example xpath: //*[@id="zhuanlilist"]/div[1]/h3
def get_patent_information(number_page):
    '''
    Input: page source of the patent page (the URL comes from 'get_company_message')
    Todo: extract the patent table cells from the page
    Return: list of <td> elements
    '''
    number_page = BeautifulSoup(number_page, features="lxml")
    data_infos = number_page.select('#zhuanlilist .app-ntable td')
    return data_infos

def save_patent(data_infos, key_company):
    '''
    Input: 1. data_infos: one page of table cells from 'get_patent_information'
           2. key_company: company name read from the CSV
    Todo: reshape the cells into rows and append them to a CSV file
    Return: None
    '''
    company_patent = [info.text for info in data_infos]
    # the table has 10 columns, so every 10 cells form one patent row
    company_patent_classified = [company_patent[i:i + 10] for i in range(0, len(company_patent), 10)]
    company_patent_classified = np.array(company_patent_classified)  # convert the list to a NumPy array so columns can be sliced
    dataframe = pd.DataFrame({'company': key_company,
                              'Number': company_patent_classified[:, 0],
                              'Patent_name': company_patent_classified[:, 1],
                              'Patent_type': company_patent_classified[:, 2],
                              'Patent_status': company_patent_classified[:, 3],
                              'Application_number': company_patent_classified[:, 4],
                              'Apply_date': company_patent_classified[:, 5],
                              'Public_announcement_No': company_patent_classified[:, 6],
                              'Public_announcement_Date': company_patent_classified[:, 7],
                              'Inventor': company_patent_classified[:, 8],
                              'More': company_patent_classified[:, 9]
                              })
    # gb18030 keeps Chinese text readable when the CSV is opened in Excel;
    # write the header only the first time the file is created
    out_path = r'D:\code\patent_info\data\output_data\company_patient1.csv'
    if not os.path.exists(out_path):
        dataframe.to_csv(out_path, index=False, sep=',', mode='a', encoding='gb18030')
    else:
        dataframe.to_csv(out_path, index=False, sep=',', mode='a', encoding='gb18030', header=False)
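
# For intuition, the reshape in save_patent turns the flat list of cell texts
# into 10-column rows, e.g. 20 cells -> a (2, 10) array:
#   cells = [str(n) for n in range(20)]
#   rows = [cells[i:i + 10] for i in range(0, len(cells), 10)]
#   np.array(rows)[:, 0]  -> array(['0', '10'], dtype='<U2')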

def whether_turn_page(element1):
    '''
    Input: xpath of the pagination element
    Todo: check whether the page-number bar exists
    Return: bool
    '''
    try:
        driver.find_element_by_xpath(element1)
        return True
    except Exception:
        return False
    # example xpath: //*[@id="zhuanlilist"]/div[4]/nav/ul
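
# is_element_exist and whether_turn_page are identical apart from their names;
# they could share one helper (a possible refactor - not wired into the call
# sites below, which keep the original functions):
def element_exists(xpath):
    try:
        driver.find_element_by_xpath(xpath)
        return True
    except Exception:
        return False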

def turn_next_page(key_company):
    '''
    Input: key_company: company name read from the CSV
    Todo: page through the patent table, saving each page; the pagination
          widget looks different depending on how many pages there are
    Return: None
    '''
    list_max = driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()]').text
    print(list_max)
    digits = re.sub(r"\D", "", list_max)  # keep only the digits of the last pagination item
    if list_max == '>':  # few pages: the last item is the "next" arrow (patent count in (10, 70])
        list_second_max = driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()-1]/a').text
        for turn_index in range(int(list_second_max) - 1):
            driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()]').click()
            time.sleep(1)
            number_page = driver.page_source
            data_infos = get_patent_information(number_page)
            save_patent(data_infos, key_company)
    elif digits:  # many pages: the last item shows the total page count (patent count above 70)
        for turn_index in range(int(digits) - 1):
            driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()-1]/a').click()
            time.sleep(1)
            number_page = driver.page_source
            data_infos = get_patent_information(number_page)
            save_patent(data_infos, key_company)
    else:
        print('error company name is {}'.format(key_company))

    # next-page button xpath: //*[@id="zhuanlilist"]/div[4]/nav/ul/li[8]/a
if __name__ == '__main__':
    csv_file = r"D:\code\patent_info\patient1.csv"
    with open(csv_file, encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for i, key_company in enumerate(reader):
            print('i is {}'.format(i))
            print('row is {}'.format(key_company))
            key_company = ' '.join(key_company)  # each row is a list of cells; join them into one name
            patent_url = get_company_message(key_company)  # patent page URL
            driver.get(patent_url)
            time.sleep(1)
            if is_element_exist('//*[@id="zhuanlilist"]/div[1]/h3'):  # the company has a patent section
                number_page = driver.page_source
                data_infos = get_patent_information(number_page)
                save_patent(data_infos, key_company)
                if whether_turn_page('//*[@id="zhuanlilist"]/div[4]/nav/ul'):  # more than one page of patents
                    turn_next_page(key_company)
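
The script above uses the Selenium 3 `find_element_by_xpath` API, which was removed in Selenium 4. A minimal sketch of the equivalent setup under Selenium 4, reusing the same local paths as above:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(r'--user-data-dir=D:\code\patent_info\ChromeUserqichacha0623')
# Selenium 4 takes the driver path via a Service object instead of executable_path
driver = webdriver.Chrome(service=Service(r'D:\code\patent_info\chromedriver.exe'),
                          options=chrome_options)

def is_element_exist(xpath):
    # find_element_by_xpath is gone; use find_element(By.XPATH, ...) instead
    try:
        driver.find_element(By.XPATH, xpath)
        return True
    except NoSuchElementException:
        return False
```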
A qichacha crawler is a program, written in Python, that fetches data from the qichacha website. qichacha (企查查) is a service for looking up company information; with a crawler you can automatically retrieve a company's basic details, shareholder information, legal representative, registered capital, and other data.

In Python this can be implemented with third-party libraries such as Requests and BeautifulSoup: send an HTTP request to the qichacha site, then parse the returned HTML page to extract the data you need. Requests sends the GET or POST request, and BeautifulSoup parses the HTML.

The following simple example shows what such a crawler can look like:

```python
import requests
from bs4 import BeautifulSoup

def get_company_info(company_name):
    url = "https://www.qichacha.com/search?key=" + company_name
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # Parse the HTML page and extract the data you need.
    # This is only an example; adjust the selectors to the actual page structure.
    company_info = soup.find("div", class_="search-ent").text
    return company_info

# fetch the information for one company
company_name = "阿里巴巴"
info = get_company_info(company_name)
print(info)
```

Note that qichacha may use anti-crawling mechanisms, so a crawler needs strategies such as setting appropriate request headers and routing requests through proxy IPs.
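
As a minimal sketch of those two strategies, requests accepts both a headers and a proxies mapping (the proxy address below is a placeholder, not a working endpoint):

```python
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}
# placeholder proxy - replace with a proxy you are actually allowed to use
proxies = {
    "http": "http://127.0.0.1:8080",
    "https": "http://127.0.0.1:8080",
}
response = requests.get("https://www.qichacha.com/search?key=阿里巴巴",
                        headers=headers, proxies=proxies, timeout=10)
print(response.status_code)
```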
