一、效果图
二、源码
''' 测试内容页爬取'''
def test_content_url(self):
try:
url = self.test_url_var.get().strip()
items = self.content_tree.get_children('')
content = self.get_html(url)
content_dict = {}
self.test_text.delete(1.0, END)
for item in items:
value = self.content_tree.item(item).get('values')
if value[4] == 0:
print(value)
if value[5] == 0:
# substring
return_value = self.deal_with_sustring(content, value[1], value[2])
if value[6]:
return_value = self.request_again(url, return_value, value[6])
if value[7]:
exec_content = value[7].format(return_value)
return_value = self.deal_with_python(exec_content)
return_value = self.c
content_dict[value[0]] = return_value
self.test_text.insert(END, value[0] + ': ' + return_value + '\n')
else:
# re
pattern = re.findall(value[3], content, re.I|re.M)
if pattern:
pattern_value = pattern[0]
else:
pattern_value = ''
if value[6]:
pattern_value = self.request_again(url, pattern_value, value[6])
if value[7]:
exec_content = value[7].format(pattern_value)
return_value = self.deal_with_python(exec_content)
self.test_text.insert(END, value[0] + ': ' + pattern_value + '\n')
content_dict[value[0]] = pattern_value
else:
print('%s在列表页提取' % value[0])
print(content_dict)
except Exception as e:
print(e)
self.test_text.insert(END, '错误信息:' + str(e))
有需要源码的可以评论哦~