在爬取网页内容的时候,有时会遇到 text 的内容为空的情况,如下图所示。这应该就是 display:none 的问题,遇到这样的问题要改变 CSS 的 style 中的内容,这需要用到 pyquery 这个库。
下图是原始 F12 中 display:block 时显示的内容。所以爬取的时候,内容以实际得到的内容为准,也就是 requests.get(url=" ", headers=" ").text。
不多说,直接上代码,下面有详细代码。本文是用正则表达式得到结果的,代码看不懂的可以评论。最终结果如图所示。
import requests
import re
from pyquery import PyQuery as pq
""""
Scrape brand listing information from soupu.com.
url = http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0
"""
class SoPu(object):
    """Scraper for one soupu.com brand listing page.

    ``get_url_list`` downloads the raw HTML of ``self.url`` and
    ``get_content`` extracts the per-brand fields from that HTML with regular
    expressions and prints them.

    The field patterns match single-quoted attributes exactly as they appear
    in the raw response text, so the HTML must be passed to ``get_content``
    unmodified (re-serialising it through an HTML parser could change the
    quoting and break the patterns).
    """

    # Placeholder printed when a field cannot be found inside an entry.
    _UNKNOWN = '未知'

    # One match per brand entry; group 1 is the entry's inner markup.
    _ENTRY_PATTERN = re.compile(
        r'class="table_style2">(.*?)</p> </div>', re.M | re.S)

    # (printed label, pattern) pairs, listed in the order they are printed.
    # Compiled once at class creation instead of on every get_content() call.
    # Labels are runtime output and are kept as originally spelled; the only
    # change is 'Compay_eara', which used to be printed as a second
    # 'Compay_type' because of a copy-paste slip.
    _FIELD_PATTERNS = (
        ('Shop_name',
         re.compile(r"<img.*?alt='(.*?)'.*? />", re.M | re.S)),
        ('Shop_img_url',
         re.compile(r"<img.*?src='(.*?)'.*? />", re.M | re.S)),
        ('Compay_namme',
         re.compile(r" <td width='255'>(.*?)</td>", re.M | re.S)),
        ('Compay_adress',
         re.compile(r"<td width='170'>(.*?)</td>")),
        ('Compay_type',
         re.compile(r"<td width='170'>.*? <td>(.*?)</td>")),
        ('Compay_eara',
         re.compile(r"<td width='170'>.*? <td>.*?<td>(.*?)</td>")),
        ('Compay_extend',
         re.compile(r"<td width='255'>.*?<td width='255'>(.*?)</td>")),
        ('Compay_data',
         re.compile(r"<td width='170'>.*?<td width='170'>(.*?)</td>")),
        ('Compay_updata_data',
         re.compile(r"<p class='a999'>(.*?)</p>")),
        ('Compay_follow',
         re.compile(r"<span class='a_f_Georgia'>(.*?)</span>")),
    )

    def __init__(self) -> None:
        """Set the listing URL and the request headers sent with it."""
        self.url = "http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0"
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }

    def get_url_list(self, timeout=10):
        """Download the listing page and return its raw HTML.

        Args:
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error (previously the call could block
                forever).

        Returns:
            The response body as text, or ``None`` (after printing a
            message) when the server answers with an error status.

        Raises:
            requests.RequestException: On connection problems or timeout.
        """
        # ``display: none`` only affects how a browser renders the page; the
        # hidden markup is already present in ``response.text``, so there is
        # nothing to "unhide" before parsing.  The previous implementation
        # built a PyQuery document, tested ``.attr`` (an accessor object that
        # is always truthy), called ``.attr("display: block")`` (a
        # one-argument getter that changes nothing) and then downloaded the
        # same page a second time.  A single request yields the same HTML.
        response = requests.get(
            url=self.url, headers=self.header, timeout=timeout)
        if not response.ok:
            # This failure branch was unreachable before.
            print("无法获取本页面内容")
            return None
        return response.text

    def get_content(self, html):
        """Print every known field of every brand entry found in ``html``.

        For each entry matched by ``_ENTRY_PATTERN`` one ``label: value``
        line is printed per field in ``_FIELD_PATTERNS``; a field that is not
        found is printed with the placeholder ``_UNKNOWN``.

        Args:
            html: Raw page HTML as returned by ``get_url_list``.  ``None`` or
                an empty string is accepted and produces no output.

        Returns:
            None.
        """
        if not html:
            # get_url_list() returns None on failure; nothing to parse.
            return
        for entry in self._ENTRY_PATTERN.findall(html):
            for label, pattern in self._FIELD_PATTERNS:
                match = pattern.search(entry)
                value = match.group(1) if match is not None else self._UNKNOWN
                print(label + ':', value)

    def net_page(self):
        """Placeholder for next-page handling; not implemented yet."""
        pass
if __name__ == '__main__':
    # Script entry point: download the listing page, then print its fields.
    sopu = SoPu()
    html = sopu.get_url_list()
    # get_url_list() has a failure branch that returns None; handing None to
    # the regex-based parser would raise TypeError, so skip parsing instead.
    if html is not None:
        sopu.get_content(html)