参考链接:https://blog.csdn.net/weixin_42866931/article/details/110558601
参考链接:https://blog.csdn.net/weixin_41173374/article/details/100053060
练习反爬链接:http://glidedsky.com/level/crawler-css-puzzle-1
......
# Fragment of an async crawl routine: the enclosing definition is elided above
# and the original indentation was lost in this paste.
# URL template for one listing page; {0} is replaced by the page number.
url = "http://glidedsky.com/level/web/crawler-css-puzzle-1?page={0}"
# Running sum of the per-page totals returned by parse_detail.
results = 0
# Visit pages 1 through 1000 in order.
for page_num in range(1, 1001):
# NOTE(review): asyncio.wait_for takes its timeout in seconds, so 100000 is
# roughly 27.8 hours -- confirm whether milliseconds were intended.
await asyncio.wait_for(page.goto(url.format(page_num)), timeout=100000)
# HTML of the page that was just loaded (presumably a pyppeteer Page -- TODO confirm).
content = await page.content()
# Add this page's decoded total to the running sum.
results += await self.parse_detail(content)
print(results)
async def parse_detail(self, content):
    """Decode every number cell on one page and return their sum.

    Args:
        content: HTML text of the page.

    Returns:
        The sum of the integers returned by ``parse_numbers`` for each
        ``<div class="col-md-1">`` cell on the page.

    Raises:
        IndexError: If the page contains no ``<style>`` tag.
    """
    document = BeautifulSoup(content, 'lxml')
    number_cells = document.find_all('div', {'class': 'col-md-1'})
    # String form of the first <style> tag; it holds the CSS used for decoding.
    style_text = str(document.find_all('style')[0])

    # Cells are decoded one after another, in document order.
    page_values = [
        await self.parse_numbers(cell.find_all('div'), style_text)
        for cell in number_cells
    ]
    page_total = sum(page_values)

    print(f"每一页的结果: {page_total}".rjust(60, '-'))
    return page_total
async def parse_offset(self, raw_div, css_info):
    """Rebuild a three-digit number whose digit elements are shifted by CSS.

    Each digit element is looked up by its first CSS class in ``css_info``
    using the module-level regex templates ``pattern_opacity`` and
    ``pattern_left`` (each is %-formatted with the class name).

    Args:
        raw_div: The digit elements of one number, 3 or 4 of them. With 4
            elements the first one is discarded.
        css_info: Text of the page's ``<style>`` block.

    Returns:
        A ``(number, class_name_msg)`` tuple: the three digits joined into a
        string, and a debug string made of ``"class:digit "`` entries.

    Raises:
        ValueError: If ``raw_div`` does not hold 3 or 4 elements, or if a
            ``left`` offset would move a digit outside the three slots.
    """
    count = len(raw_div)
    if count == 4:
        # NOTE(review): assumes the surplus element is always the first one
        # -- confirm against the page markup.
        real_div = raw_div[1:]
    elif count == 3:
        real_div = raw_div
    else:
        raise ValueError("严重致命错误,real_div长度匹配不上")

    num_list = ["0", "0", "0"]
    class_parts = []
    for index, div in enumerate(real_div):
        class_name = div.get('class')[0]
        num = div.text
        class_parts.append(f"{class_name}:{num} ")

        # 1. An opacity rule matched for this class: keep the digit in place.
        # NOTE(review): the meaning depends on what pattern_opacity matches,
        # which is defined outside this block -- confirm.
        if re.search(pattern_opacity % class_name, css_info, re.S):
            num_list[index] = num
            continue

        # 2. No `left` rule for this class: the digit is already in position.
        is_left = re.search(pattern_left % class_name, css_info, re.S)
        if not is_left:
            num_list[index] = num
            continue

        # 3. The digit is displaced: the matched text is the signed number of
        # slots to move it by.
        left_num = int(is_left.group())
        target = index + left_num
        # Reject out-of-range targets explicitly; a negative index would
        # otherwise wrap around and silently overwrite the wrong slot.
        if not 0 <= target < len(num_list):
            raise ValueError(
                f"偏移后的位置越界: class_name={class_name} "
                f"index={index} left={left_num}"
            )
        num_list[target] = num

    return "".join(num_list), "".join(class_parts)
async def parse_numbers(self, raw_div, css_info, class_name=None):
    """Decode the number shown by one cell and return it as an int.

    With 1 or 2 elements the value is read from ``css_info`` by applying the
    module-level ``pattern`` regex template (%-formatted with the first CSS
    class of the last element). With 3 or 4 elements the digits are
    reassembled by ``parse_offset``.

    Args:
        raw_div: The ``<div>`` elements found inside one number cell.
        css_info: Text of the page's ``<style>`` block.
        class_name: Ignored on input; it is always overwritten before use and
            is kept only so the signature stays compatible.

    Returns:
        The decoded number as an ``int``.

    Raises:
        ValueError: If ``raw_div`` is empty or holds more than 4 elements, or
            if no rule for the class is found in ``css_info``.
    """
    count = len(raw_div)
    if count in (1, 2):
        # The last element carries the class; this works for both lengths.
        class_name = raw_div[-1].get('class')[0]
        matched = re.search(pattern % class_name, css_info, re.S)
        if matched is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"在css中找不到class对应的值: {class_name}")
        num = matched.group()
    elif count in (3, 4):
        # parse_offset returns the digits plus a per-class debug string, which
        # takes the place of class_name in the log line below.
        num, class_name = await self.parse_offset(raw_div, css_info)
    else:
        raise ValueError("raw_div长度找不到匹配项")

    print(f"值:{num} Class属性:{class_name}")
    return int(num)
输出:
值:232 Class属性:bcM1etkO:3 seRSd2bXog:2 NM3UVsq:2
值:348 Class属性:fQzid5Bwlu
值:152 Class属性:aPIxB7Hcf:2 my8jMh:5 hV9gZx:1
值:428 Class属性:wbc10oiXmr
值:324 Class属性:sQYj11AXSE
值:270 Class属性:zA12JKn
值:281 Class属性:fkUrj14iYYz
值:252 Class属性:ioFK15NVF
值:163 Class属性:Eox17fUQwR:3 OWd18Emweg:1 UA19uCsTk:6
值:149 Class属性:nAZ21ISD:4 MKeI22SGT:9 iuSB23gJhu:1
值:207 Class属性:OC24tyZGF
值:260 Class属性:ipQ25RjzN:0 qOQ26CIOi:2 SMiL27Tiifb:6
------------------------------------------------每一页的结果: 3066
值:349 Class属性:bcM1etkO:4 seRSd2bXog:3 NM3UVsq:9
值:269 Class属性:fQzid5Bwlu
值:346 Class属性:aPIxB7Hcf:6 my8jMh:4 hV9gZx:3
值:437 Class属性:wbc10oiXmr
值:256 Class属性:sQYj11AXSE
值:183 Class属性:zA12JKn
值:379 Class属性:fkUrj14iYYz
值:282 Class属性:ioFK15NVF
值:300 Class属性:Eox17fUQwR:0 OWd18Emweg:3 UA19uCsTk:0
值:351 Class属性:nAZ21ISD:5 MKeI22SGT:1 iuSB23gJhu:3
值:165 Class属性:OC24tyZGF
值:154 Class属性:ipQ25RjzN:4 qOQ26CIOi:1 SMiL27Tiifb:5
------------------------------------------------每一页的结果: 3471