时隔已久,终于决定写自己的第二个爬虫。它和第一个爬虫没有什么本质区别:前者爬取的是图片,这次爬取的是文章。用到的库有 requests 和 re。
代码如下:
import re
import requests
import re
import requests
def getpages(url, timeout=10):
    """Download ``url`` and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``. Without it a stalled
            connection would block the scraper forever.

    Returns:
        The response body as ``str``, decoded with the encoding detected
        from the body content.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # NOTE(review): hard-coded cookie, presumably copied from a browser
    # session (it carries a JSESSIONID) — likely to expire; confirm whether
    # the target site actually requires it.
    headers = {'Cookie': 'gr_user_id=2e9f3f44-e4ba-466f-9588-966a9c3e3bc4; grwng_uid=cad3b972-f836-413f-96dd-07dac52149e6; UM_distinctid=1671a5f4101181-00ddc2c5bb38ed-37664109-144000-1671a5f41055f; JSESSIONID=B4C4952E75BDA4DB1373545CEA6BF187; safedog-flow-item=DF5FEB703783F399A4D8599B2E17DFC5'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP error statuses instead of parsing an error page.
    response.raise_for_status()
    # Decode with the encoding guessed from the body rather than the one
    # derived from the response headers.
    response.encoding = response.apparent_encoding
    return response.text
def getnews1(html,infolist):
s = re.findall(r'href=".*?".*?title=".*?"',html)
for i in range(len(s)):
news = eval((s[i].split(' ')[0]).split('=')[1] + '=' + (s[i].split(' ')[0]).split('=