爬虫总结
以下是我在做爬虫时积累的一些实用经验,现在整理出来分享一下
常见去除标签
单标签
# Strip void (single) tags from str_html.
# `\b` pins the exact tag name and `[^>]*` lets attributes wrap onto several
# lines, so neither re.S nor a separate `/>` variant is required: the same
# pattern covers <br>, <br/> and <br class="x" />.
str_html = re.sub(r'<input\b[^>]*>', '', str_html)
# Line-breaking tags become a newline so adjacent text does not run together.
str_html = re.sub(r'<br\b[^>]*>', '\n', str_html)
str_html = re.sub(r'<hr\b[^>]*>', '\n', str_html)
str_html = re.sub(r'<img\b[^>]*>', '', str_html)
区间
# Remove these elements together with everything between their tags.
# `\b` after the tag name is essential here: without it `<i` also matches
# <iframe>/<ins>/<img>/<input> and `<h` matches <html>/<head>/<header>, and the
# lazy body would then swallow real content up to the next </i> or </hN>.
for element in ('script', 'style', 'form', 'i', 'figure'):
    str_html = re.sub(r'<%s\b[^>]*>[\s\S]*?</%s>' % (element, element), '', str_html)
# Headings: only h1-h6, and the closing tag must carry the same level (\1).
str_html = re.sub(r'<h([1-6])\b[^>]*>[\s\S]*?</h\1>', '', str_html)
前后标签
# Drop the opening and closing tags of these elements but keep their content.
# Each entry is (tag name, replacement for the closing tag): block-level
# elements end with a newline, inline ones vanish.
# `\b` keeps the match to the exact tag, so `p` no longer strips <pre> or
# <picture> and `a` no longer strips <abbr>, <audio> or <address>.
for tag, closing_repl in (
    ('div', '\n'),
    ('p', '\n'),
    ('strong', ''),
    ('span', ''),
    ('a', ''),
    ('caption', '\n'),
    ('iframe', '\n'),
    ('article', '\n'),
):
    str_html = re.sub(r'<%s\b[^>]*>' % tag, '', str_html)
    str_html = re.sub(r'</%s\b[^>]*>' % tag, closing_repl, str_html)
嵌套标签
# List and table structure: remove the tags, end each item/row with a newline.
# `\b` keeps `li` from matching <link> and `tr` from matching <track>.
for tag in ('ul', 'ol', 'li', 'table', 'tbody', 'tr'):
    str_html = re.sub(r'<%s\b[^>]*>' % tag, '', str_html)
    str_html = re.sub(r'</%s\b[^>]*>' % tag, '\n', str_html)
# Header cells are discarded together with their text. The body must be lazy
# (`*?`): a greedy match would delete everything between the first <th> and
# the last </th> in the document. `\b` also stops <thead> from matching.
# NOTE(review): <thead>/</thead> tags themselves are not handled by this list.
str_html = re.sub(r'<th\b[^>]*>[\s\S]*?</th>', '', str_html)
# Data cells keep their text; only the tags are removed.
str_html = re.sub(r'<td\b[^>]*>', '', str_html)
str_html = re.sub(r'</td>', '', str_html)
修饰标签
# Formatting and sectioning tags: remove the tags, keep the enclosed text.
# Each entry is (tag name, replacement for the closing tag).
# `\b` pins the exact name; the bare prefixes used before also matched
# unrelated tags (`b` -> <body>/<blockquote>/<button>, `em` -> <embed>,
# `del` -> <details>, `u` -> <ul>) and left their closing tags orphaned.
for tag, closing_repl in (
    ('u', ''),
    ('em', ''),
    ('b', ''),
    ('del', ''),
    ('ins', ''),
    ('sup', ''),
    ('sub', ''),
    ('small', ''),
    ('center', '\n'),
    ('aside', '\n'),
    ('section', '\n'),
    ('noscript', '\n'),
):
    str_html = re.sub(r'<%s\b[^>]*>' % tag, '', str_html)
    str_html = re.sub(r'</%s\b[^>]*>' % tag, closing_repl, str_html)
其他标签
# Remaining clutter, all replaced by the empty string: <font> tags, the
# <o:p>/<u1:p> namespaced tags, and HTML comments. The patterns are applied
# one after another, in the order listed.
for leftover_pattern in (
    r'<font.*?>',
    r'</font>',
    r'<o:p>',
    r'</o:p>',
    r'<u1:p>',
    r'</u1:p>',
    r'<!--[\s\S]*?-->',
):
    str_html = re.sub(leftover_pattern, '', str_html, flags=re.S)
xpath相关
匹配多标签
//*[contains(@class,"classname")]
python循环时
在循环中对上一步得到的节点继续查找时,xpath表达式前面一定要加上 `.`(例如 `.//div`)。
否则,以 `//` 开头的表达式会从整个文档重新开始匹配,
而忽略循环中已经找到的节点。
xpath找出网页源码的方法
# Serialise an element found via xpath back into its HTML source text.
# `element` is a placeholder for that xpath result (one element, not the list
# that .xpath() returns). The element is the first positional argument and the
# output encoding is passed separately as `encoding=`.
content_html_str = etree.tostring(element, encoding='utf-8', pretty_print=True, method='html').decode('utf-8')
print(content_html_str)
输出xpath多个标签文本的方法
先用xpath语法定位到标签,再对该标签使用 string(.) 取出它及其所有子标签的文本
# Locate the container element first, then let XPath's string() collapse it
# (and everything nested inside it) into one text value.
post_nodes = a.xpath('.//div[contains(@class,"j_d_post_content")]')
post_node = post_nodes[0]  # first match; raises IndexError if nothing matched
print(post_node)
print(post_node.xpath('string(.)'))
xpath选择没有属性的标签
# Select every <div> in the document that has no class attribute.
tree.xpath('//div[not(@class)]')
存储的方法
def save_json(url, title, time, class_name, content, type, review, save_path):
    """Append one scraped record to ``save_path`` as a single JSON line.

    Args:
        url: Page URL; passed through ``parse.unquote`` before being stored
            (``parse`` is presumably ``urllib.parse`` -- confirm the import).
        title: Stored under ``"title"``.
        time: Stored under ``"time"``.
        class_name: Stored under ``"class_name"``.
        content: Iterable of paragraph strings. Paragraphs of two characters
            or fewer are dropped; the rest are joined into one string and
            every run of whitespace is collapsed to a single space.
        type: Stored under ``"type"``.
        review: Accepted for interface compatibility; see the note below.
        save_path: File to append to (opened in ``a+`` mode as UTF-8).
    """
    # Filter on each paragraph's own length. The earlier version tested
    # len(content) -- the length of the whole list -- so short paragraphs were
    # never removed individually.
    paragraphs = [para for para in content if len(para) > 2]

    # NOTE(review): both separators are "<p>" (no "</p>"); kept byte-for-byte
    # because consumers of the output may rely on it -- confirm the intent.
    joined = '<p>' + '<p><p>'.join(paragraphs) + '<p>'
    joined = re.sub(r'\s+', ' ', joined)

    record = {
        "url": parse.unquote(url),
        "title": title,
        "time": time,
        "class_name": class_name,
        'content': joined,
        'type': type,
        # NOTE(review): the caller's `review` value has always been replaced
        # by '' before saving; preserved here -- confirm this is intentional.
        'review': '',
    }

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
    with open(save_path, 'a+', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')