目标网站:http://bohaishibei.com/post/category/main/ (一个很有趣的网站,一段话配一张图,老有意思了~;下面的代码实际请求的地址是 https://bh.sb/post/category/main/ )。网站形式如下:
爬取标题和图片链接
import re
import requests
url = "https://bh.sb/post/category/main/"

# Matches one post entry: an <h2><a ... title=... - 博海拾贝 ...</h2> heading
# followed (possibly across newlines, hence re.S) by a src="..." attribute and
# a later "class". Group 1 is the text between `title=` and the site-name
# suffix (it starts right after `title=`, so an opening quote character is
# part of the capture); group 2 is the value of the src attribute.
# NOTE: the closing tag must be written `</h2>`; `<\h2>` is an invalid
# escape sequence and makes re.compile raise re.error.
pattern = re.compile(
    r'<h2><a.*?title=(.*?) - 博海拾贝.*?</h2>.*?src="(.*?)".*?class',
    re.S,
)


def extract_posts(html):
    """Return every (title, image URL) pair that ``pattern`` finds in *html*.

    Args:
        html: Decoded page source as a ``str``.

    Returns:
        A list of 2-tuples, one per matched post, in document order.
        The list is empty when nothing matches.
    """
    return pattern.findall(html)


def main():
    """Fetch the listing page, print the HTTP status, then print each match."""
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=10)
    print(r.status_code)
    for post in extract_posts(r.content.decode('utf-8')):
        print(post)


if __name__ == "__main__":
    main()
本来的写法是(正则里 src 的捕获组后面直接跟 class,所以第二个捕获结果会把结尾的引号等多余字符一起带上):
import re
import requests
url = "https://bh.sb/post/category/main/"

# Earlier version of the pattern, kept for comparison: the src group is
# followed directly by `class`, so group 2 captures everything from just after
# `src="` up to the next "class" -- including the closing quote and any
# characters between it and "class".
# NOTE: the closing tag must be written `</h2>`; `<\h2>` is an invalid
# escape sequence and makes re.compile raise re.error.
pattern = re.compile(
    r'<h2><a.*?title=(.*?) - 博海拾贝.*?</h2>.*?src="(.*?)class',
    re.S,
)


def extract_posts(html):
    """Return every (title, raw src capture) pair that ``pattern`` finds in *html*.

    Args:
        html: Decoded page source as a ``str``.

    Returns:
        A list of 2-tuples, one per matched post, in document order.
        The second element is not a clean URL: it runs up to the next
        "class" in the markup. The list is empty when nothing matches.
    """
    return pattern.findall(html)


def main():
    """Fetch the listing page, print the HTTP status, then print each match."""
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=10)
    print(r.status_code)
    for post in extract_posts(r.content.decode('utf-8')):
        print(post)


if __name__ == "__main__":
    main()