爬虫工作中的总结

爬虫总结

以下是我在做爬虫时积累的一些实用经验,现在整理分享出来。

常见去除标签

单标签

# Void (self-closing) tags.  `\b` pins the tag name exactly and `[^>]*`
# consumes the attributes up to the closing `>` (newlines included), so
# `<br>`, `<br/>`, `<br />` and tags broken across lines are all covered by
# a single pattern per tag.
str_html = re.sub(r'<input\b[^>]*>', '', str_html)
str_html = re.sub(r'<br\b[^>]*>', '\n', str_html)
str_html = re.sub(r'<hr\b[^>]*>', '\n', str_html)
str_html = re.sub(r'<img\b[^>]*>', '', str_html)

区间

# Paired tags that are removed together with everything inside them.
# Raw strings keep `\s\S` from being parsed as (invalid) string escapes, and
# the non-greedy `[\s\S]*?` stops each match at the nearest closing tag.
str_html = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', str_html)
str_html = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', str_html)
str_html = re.sub(r'<form\b[^>]*>[\s\S]*?</form>', '', str_html)
# `\b` keeps `<i` from also matching <iframe>, <ins>, <img>, <input>.
str_html = re.sub(r'<i\b[^>]*>[\s\S]*?</i>', '', str_html)
str_html = re.sub(r'<figure\b[^>]*>[\s\S]*?</figure>', '', str_html)
# Headings h1-h6 only: a bare `<h` prefix would also match <html>, <head>
# and <header>.  The backreference pairs each heading with its own level.
str_html = re.sub(r'<h([1-6])\b[^>]*>[\s\S]*?</h\1>', '', str_html)

前后标签

# Opening/closing tag pairs: the tags are stripped but the enclosed text is
# kept.  Block-level closers become a line break, inline closers vanish.
# `\b` after the tag name prevents prefix collisions (e.g. `<p` vs <pre>,
# `<a` vs <article>/<aside>/<abbr>); `[^>]*` spans multi-line attributes.
str_html = re.sub(r'<div\b[^>]*>', '', str_html)
str_html = re.sub(r'</div\s*>', '\n', str_html)

str_html = re.sub(r'<p\b[^>]*>', '', str_html)
str_html = re.sub(r'</p\s*>', '\n', str_html)

str_html = re.sub(r'<strong\b[^>]*>', '', str_html)
str_html = re.sub(r'</strong>', '', str_html)

str_html = re.sub(r'<span\b[^>]*>', '', str_html)
str_html = re.sub(r'</span>', '', str_html)

str_html = re.sub(r'<a\b[^>]*>', '', str_html)
str_html = re.sub(r'</a>', '', str_html)

str_html = re.sub(r'<caption\b[^>]*>', '', str_html)
str_html = re.sub(r'</caption>', '\n', str_html)

str_html = re.sub(r'<iframe\b[^>]*>', '', str_html)
str_html = re.sub(r'</iframe>', '\n', str_html)

str_html = re.sub(r'<article\b[^>]*>', '', str_html)
str_html = re.sub(r'</article>', '\n', str_html)

嵌套标签

# List markup: keep the item text, end each list/item with a line break.
# `\b` keeps `<li` from matching <link> and `<tr` from matching <track>.
str_html = re.sub(r'<ul\b[^>]*>', '', str_html)
str_html = re.sub(r'</ul>', '\n', str_html)
str_html = re.sub(r'<ol\b[^>]*>', '', str_html)
str_html = re.sub(r'</ol>', '\n', str_html)
str_html = re.sub(r'<li\b[^>]*>', '', str_html)
str_html = re.sub(r'</li\s*>', '\n', str_html)

# Table markup: keep the cell text, end each table/body/row with a line break.
str_html = re.sub(r'<table\b[^>]*>', '', str_html)
str_html = re.sub(r'</table>', '\n', str_html)
str_html = re.sub(r'<tbody\b[^>]*>', '', str_html)
str_html = re.sub(r'</tbody>', '\n', str_html)
str_html = re.sub(r'<tr\b[^>]*>', '', str_html)
str_html = re.sub(r'</tr>', '\n', str_html)
# Header cells are dropped with their content.  The match must be non-greedy:
# a greedy `[\s\S]*` would swallow everything from the first <th> to the last
# </th> in the document.  `\b` keeps <thead> from being treated as <th>.
str_html = re.sub(r'<th\b[^>]*>[\s\S]*?</th>', '', str_html)
str_html = re.sub(r'<td\b[^>]*>', '', str_html)
str_html = re.sub(r'</td>', '', str_html)

修饰标签

# Inline decoration tags: strip the tags, keep the text.
# `\b` after the tag name avoids prefix collisions: `<u` vs <ul>, `<em` vs
# <embed>, `<b` vs <body>/<blockquote>/<button>/<br>.
str_html = re.sub(r'<u\b[^>]*>', '', str_html)
str_html = re.sub(r'</u>', '', str_html)
str_html = re.sub(r'<em\b[^>]*>', '', str_html)
str_html = re.sub(r'</em>', '', str_html)
str_html = re.sub(r'<b\b[^>]*>', '', str_html)
str_html = re.sub(r'</b>', '', str_html)
str_html = re.sub(r'<del\b[^>]*>', '', str_html)
str_html = re.sub(r'</del>', '', str_html)
str_html = re.sub(r'<ins\b[^>]*>', '', str_html)
str_html = re.sub(r'</ins>', '', str_html)
str_html = re.sub(r'<sup\b[^>]*>', '', str_html)
str_html = re.sub(r'</sup>', '', str_html)
str_html = re.sub(r'<sub\b[^>]*>', '', str_html)
str_html = re.sub(r'</sub>', '', str_html)
str_html = re.sub(r'<small\b[^>]*>', '', str_html)
str_html = re.sub(r'</small>', '', str_html)
# Sectioning wrappers: the closing tag becomes a line break.
str_html = re.sub(r'<center\b[^>]*>', '', str_html)
str_html = re.sub(r'</center>', '\n', str_html)
str_html = re.sub(r'<aside\b[^>]*>', '', str_html)
str_html = re.sub(r'</aside>', '\n', str_html)
str_html = re.sub(r'<section\b[^>]*>', '', str_html)
str_html = re.sub(r'</section>', '\n', str_html)
str_html = re.sub(r'<noscript\b[^>]*>', '', str_html)
str_html = re.sub(r'</noscript>', '\n', str_html)

其他标签

# Legacy <font> tags and the namespaced <o:p> / <u1:p> tags: strip the tags,
# keep the text.
str_html = re.sub(r'<font\b[^>]*>', '', str_html)
str_html = re.sub(r'</font>', '', str_html)
str_html = re.sub(r'<o:p>', '', str_html)
str_html = re.sub(r'</o:p>', '', str_html)
str_html = re.sub(r'<u1:p>', '', str_html)
str_html = re.sub(r'</u1:p>', '', str_html)

# HTML comments, including multi-line ones.  The raw string keeps `\s\S` from
# being parsed as (invalid) string escapes.
str_html = re.sub(r'<!--[\s\S]*?-->', '', str_html)

xpath相关

匹配多标签

//*[contains(@class,"classname")]

python循环时

在循环中对上一步得到的节点继续查找子标签时,xpath 表达式前面一定要加上 .(例如 ./div 或 .//div)。

否则,如果表达式直接以 // 开头,匹配会重新从整个文档的根节点开始,忽略循环中已经找到的节点。

xpath找出网页源码的方法

# `你xpath的结果` is a placeholder for ONE element taken from your XPath result
# (xpath() returns a list, so pick an item such as result[0]).  The element is
# the positional argument; the output encoding goes in the `encoding=` keyword.
content_html_str = etree.tostring(你xpath的结果, encoding='utf-8', pretty_print=True, method='html').decode('utf-8')
print(content_html_str)

输出xpath多个标签文本的方法

先xpath语法找到标签,再使用string

# Relative to node `a`, take the first <div> whose class attribute contains
# "j_d_post_content".
matched_divs = a.xpath('.//div[contains(@class,"j_d_post_content")]')
b = matched_divs[0]
print(b)
# Evaluate the XPath string(.) expression on that node and print the result.
node_text = b.xpath('string(.)')
print(node_text)

xpath选择没有属性的标签

# Select every <div> element that has no `class` attribute.
tree.xpath('//div[not(@class)]')

存储的方法

def save_json(url, title, time, class_name, content, type, review, save_path):
    """Append one scraped record to ``save_path`` as a single JSON line.

    Args:
        url: Page URL; percent-escapes are decoded before saving.
        title: Stored under the ``title`` key as given.
        time: Stored under the ``time`` key as given.
        class_name: Stored under the ``class_name`` key as given.
        content: Iterable of paragraph strings (a lone string is treated as
            one paragraph).  Paragraphs of two characters or fewer are
            dropped; the rest are each wrapped in ``<p>...</p>`` and every
            run of whitespace is collapsed to one space.
        type: Stored under the ``type`` key as given.
        review: Stored under the ``review`` key as given.
        save_path: File to append to (UTF-8); created if it does not exist.
    """
    # NOTE: `type` shadows the builtin; the parameter name is kept so existing
    # keyword callers continue to work.
    if isinstance(content, str):
        content = [content]

    # Filter on the length of each paragraph, not of the whole list.
    paragraphs = [p for p in content if len(p) > 2]
    # Wrap every paragraph in a properly closed <p>...</p> pair.
    body = ''.join('<p>' + p + '</p>' for p in paragraphs)
    body = re.sub(r'\s+', ' ', body)

    record = {
        "url": parse.unquote(url),
        "title": title,
        "time": time,
        "class_name": class_name,
        "content": body,
        "type": type,
        # Persist the caller's value instead of discarding it.
        "review": review,
    }

    line = json.dumps(record, ensure_ascii=False)
    with open(save_path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值