import requests
from bs4 import BeautifulSoup as bs
import re
import time
# Convert a list of regex matches like ">text<" into one concatenated string.
def get_string(string_list):
    """Join the inner text of each ">...<" match into a single string.

    Each entry has '\\r', '\\n' and spaces stripped; entries longer than
    two characters (i.e. with actual content between '>' and '<')
    contribute their inner text (first and last character removed).

    Parameters
    ----------
    string_list : list[str]
        Matches of the pattern ``>.*?<`` against an HTML fragment.

    Returns
    -------
    str
        The concatenated inner text of all non-empty matches.
    """
    parts = []
    for s in string_list:
        # Strip whitespace characters the page embeds inside tags.
        s = s.replace('\r', '').replace('\n', '').replace(' ', '')
        if len(s) > 2:  # more than just the '>' and '<' delimiters
            parts.append(s[1:-1])
    # join() instead of repeated += — avoids quadratic string building.
    return ''.join(parts)
# NOTE: supply your own cookie in the headers below.
# Fetch search-result pages and return them as parsed HTML documents.
def get_html(num):  # num: number of result pages to fetch
    """Fetch ``num`` soopat.com result pages and return them parsed.

    Parameters
    ----------
    num : int
        Number of result pages to fetch (the site shows 10 hits per page,
        so page ``i`` uses ``PatentIndex=i*10``).

    Returns
    -------
    list[BeautifulSoup]
        One parsed document per fetched page.
    """
    html_list = []
    head = {"Host": "www1.soopat.com", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "Referer": "http://www1.soopat.com/Home/Result?SearchWord=%E5%8A%A0%E7%83%AD%E5%99%A8&FMZL=Y&SYXX=Y&WGZL=Y&FMSQ=Y", "Cookie": "420.817; patentids=; __gads=ID=766c92f367f37ALNI_MZzf4w71dPXaGWJ_qvbDfhIABniEA; Hm_lvt_2b103433893a8cf930605886844fd95b=1591375407,1591525422; advu1=; advu2=; advu3=; advu4=; monitor_count=3; Hm_lpvt_2b103433893a8cf930605886844fd95b=1591525522"}
    for i in range(num):
        time.sleep(2)  # throttle: wait two seconds so we don't hit the site too often
        r = requests.get(
            "http://www1.soopat.com/Home/Result?SearchWord=%E5%8A%A0%E7%83%AD%E5%99%A8&FMZL=Y&SYXX=Y&WGZL=Y&FMSQ=Y&PatentIndex=" + str(i * 10),
            headers=head,
        )
        # Let requests sniff the real charset instead of assuming ISO-8859-1.
        r.encoding = r.apparent_encoding
        # Name the parser explicitly: bs(text) with no parser picks whichever
        # library happens to be installed, which is nondeterministic and warns.
        html_list.append(bs(r.text, "html.parser"))
    return html_list
# Extract the abstract and applicant-info strings from parsed result pages.
def get_info(htmllist):
    """Return ``(abstracts, names)`` extracted from parsed result pages.

    Parameters
    ----------
    htmllist : list[BeautifulSoup]
        Parsed result pages, as produced by :func:`get_html`.

    Returns
    -------
    tuple[list[str], list[str]]
        Parallel lists: patent abstracts (``PatentContentBlock``) and
        author/applicant blocks (``PatentAuthorBlock``).
    """
    abstract_l = []
    name_l = []
    # Hoisted out of the loop: the pattern is loop-invariant.
    tag_text = re.compile(r'>.*?<', re.S)
    for html in htmllist:
        name_list = html.find_all(class_="PatentAuthorBlock")
        abstract_list = html.find_all(class_="PatentContentBlock")
        # Only pair the two result sets when their counts match;
        # otherwise skip the page so the output lists stay aligned.
        if len(name_list) == len(abstract_list):
            for i in range(len(name_list)):
                abstract_l.append(get_string(tag_text.findall(str(abstract_list[i]))))
                name_l.append(get_string(tag_text.findall(str(name_list[i]))))
    return abstract_l, name_l