爬虫:爬取专利摘要及申请信息

导入库

import requests
from bs4 import BeautifulSoup as bs
import re
import time
# 列表转成字符串
def get_string(string_list):
    """Join a list of text fragments into one cleaned string.

    Every fragment has its carriage returns, newlines and spaces removed.
    Fragments that are still longer than two characters afterwards have
    their first and last character dropped and are appended to the result;
    shorter fragments are skipped.

    Args:
        string_list: Iterable of ``str`` fragments.

    Returns:
        The concatenation of the trimmed fragments ("" if none qualify).
    """
    pieces = []
    for fragment in string_list:
        cleaned = fragment.replace('\r', '').replace('\n', '').replace(' ', '')
        # Two characters or fewer means nothing is left once both ends are cut.
        if len(str(cleaned)) > 2:
            pieces.append(cleaned[1:-1])
    return "".join(pieces)
# cookie 自己补充
# 得到html的字符串
def get_html(num, timeout=10):
    """Download ``num`` result pages of the hard-coded Soopat search.

    Page ``i`` is requested with ``PatentIndex = i * 10`` (presumably ten
    results per page -- confirm against the site). The function sleeps two
    seconds before every request so the site is not hit too frequently.

    Args:
        num: Number of result pages to fetch. Zero or a negative value
            fetches nothing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled connection would block forever.

    Returns:
        A list of parsed ``BeautifulSoup`` documents, one per page, in
        request order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    # The Cookie value is a placeholder from the original post; supply your own.
    head = {
        "Host": "www1.soopat.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Referer": "http://www1.soopat.com/Home/Result?SearchWord=%E5%8A%A0%E7%83%AD%E5%99%A8&FMZL=Y&SYXX=Y&WGZL=Y&FMSQ=Y",
        "Cookie": "420.817; patentids=; __gads=ID=766c92f367f37ALNI_MZzf4w71dPXaGWJ_qvbDfhIABniEA; Hm_lvt_2b103433893a8cf930605886844fd95b=1591375407,1591525422; advu1=; advu2=; advu3=; advu4=; monitor_count=3; Hm_lpvt_2b103433893a8cf930605886844fd95b=1591525522",
    }
    base_url = "http://www1.soopat.com/Home/Result?SearchWord=%E5%8A%A0%E7%83%AD%E5%99%A8&FMZL=Y&SYXX=Y&WGZL=Y&FMSQ=Y&PatentIndex="

    html_list = []
    for i in range(num):
        time.sleep(2)  # throttle: wait two seconds between requests
        r = requests.get(base_url + str(i * 10), headers=head, timeout=timeout)
        # Decode with the encoding detected from the body, not the header.
        r.encoding = r.apparent_encoding
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed, and bs4 does not warn.
        html_list.append(bs(r.text, "html.parser"))
    return html_list
# 得到摘要及申请信息列表
def get_info(htmllist):
    """Extract abstract and application-info strings from parsed pages.

    For each page, the elements with class ``PatentAuthorBlock`` and
    ``PatentContentBlock`` are collected. A page is used only when both
    lists have the same length; otherwise it is skipped entirely. Each
    element is rendered to markup, every ``>...<`` span is pulled out with
    a regex, and the spans are cleaned and joined by ``get_string``.

    Args:
        htmllist: Iterable of parsed pages exposing ``find_all(class_=...)``.

    Returns:
        A tuple ``(abstract_l, name_l)`` of two equally long lists of
        strings: the text from the content blocks and the text from the
        author blocks, in page order.
    """
    # Non-greedy, DOTALL: captures the text between tags with its delimiters.
    between_tags = re.compile(r'>.*?<', re.S)
    abstract_l, name_l = [], []
    for page in htmllist:
        author_blocks = page.find_all(class_="PatentAuthorBlock")
        content_blocks = page.find_all(class_="PatentContentBlock")
        if len(author_blocks) != len(content_blocks):
            # The blocks cannot be paired up, so ignore this page.
            continue
        for author_tag, content_tag in zip(author_blocks, content_blocks):
            abstract_l.append(get_string(between_tags.findall(str(content_tag))))
            name_l.append(get_string(between_tags.findall(str(author_tag))))
    return abstract_l, name_l
# Fetch 3 result pages, then extract the abstract and application-info strings.
# NOTE: both calls run at module level, so executing this file immediately
# performs the network requests made by get_html.
html_list = get_html(3)
abs_list, na_list = get_info(html_list)

在这里插入图片描述

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值