Python Crawler Example (4): Scraping NetEase News
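The script below is a Python 2 crawler. It pulls NetEase's front-page headline feed (a JSONP endpoint under temp.163.com), strips the data_callback(...) wrapper, follows each article's tlink, extracts the body text from the div with id="endText" using BeautifulSoup plus regular expressions, and writes title, link, comment URL, post count, time, and content into a local SQLite database.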

#coding:utf-8
# Python 2 script: relies on reload(sys)/setdefaultencoding and print statements.

import random, re
import sqlite3
import json
import sys
import uuid

import requests
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')

session = requests.session()


def md5(text):
    # Hash of the article link, used as a unique key for the url column.
    import hashlib
    m = hashlib.md5()
    m.update(text)
    return m.hexdigest()


def wangyi():
    for i in range(1, 3):
        # Page 1 of the headline feed is cm_yaowen.js, page 2 is cm_yaowen_02.js.
        if i == 1:
            k = ""
        else:
            k = "_0" + str(i)
        url = "http://temp.163.com/special/00804KVA/cm_yaowen" + k + ".js?callback=data_callback"
        print url

        headers = {
            "Host": "temp.163.com",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER",
            "Referer": "http://news.163.com/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        result = session.get(url=url, headers=headers).text
        try:
            # Strip the data_callback(...) JSONP wrapper. The double eval() executes
            # whatever the server returns; see the json.loads() sketch after the listing.
            result1 = eval(eval((json.dumps(result)).replace('data_callback(', '').replace(')', '').replace(' ', '')))
        except:
            pass

        try:
            for item in result1:  # renamed from `i`, which shadowed the page counter
                tlink = item['tlink']
                headers2 = {
                    "Host": "news.163.com",
                    "Connection": "keep-alive",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Upgrade-Insecure-Requests": "1",
                    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER",
                    "Accept-Encoding": "gzip, deflate, sdch",
                    "Accept-Language": "zh-CN,zh;q=0.8",
                }
                print "tlink:", tlink
                return_data = session.get(url=tlink, headers=headers2).text
                try:
                    soup = BeautifulSoup(return_data, 'html.parser')
                    returnSoup = soup.find_all("div", attrs={"id": "endText"})[0]
                    print returnSoup
                    print "==============================="
                    try:
                        # NOTE: the original pattern was swallowed by the blog's HTML
                        # rendering; matching plain paragraph tags is a reconstruction.
                        returnList = re.findall(r'<p>(.*?)</p>', str(returnSoup))
                        content1 = ''.join(returnList)
                    except:
                        content1 = ""
                    try:
                        # Second pattern also lost in extraction; assumed to match the
                        # class-attributed paragraphs in the article body.
                        returnList1 = re.findall(r'<p class=".*?">(.*?)</p>', str(returnSoup))
                        content2 = ''.join(returnList1)
                    except:
                        content2 = ""
                    content = content1 + content2
                except:
                    content = ""

                cx = sqlite3.connect("C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3", check_same_thread=False)
                cx.text_factory = str
                try:
                    print "inserting data for link %s" % (url)
                    tlink = item['tlink']
                    title = (item['title']).decode('unicode_escape')
                    commenturl = item['commenturl']
                    tienum = item['tienum']
                    opentime = item['time']
                    print title
                    print tlink
                    print commenturl
                    print tienum
                    print opentime
                    print content
                    url2 = md5(str(tlink))
                    cx.execute("INSERT INTO wangyi (title,tlink,commenturl,tienum,opentime,content,url) VALUES (?,?,?,?,?,?,?)",
                               (str(title), str(tlink), str(commenturl), str(tienum), str(opentime), str(content), str(url2)))
                except Exception as e:
                    print e
                    print "insert failed"
                cx.commit()
                cx.close()
        except:
            pass


wangyi()
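The double eval() in the listing works because the feed body is a JavaScript array literal, but it executes whatever the server sends back. If the unwrapped body is valid JSON (usually the case for this feed, though that is an assumption the original post does not verify), json.loads can replace it. A minimal sketch against the same cm_yaowen URL:

#coding:utf-8
# Sketch: unwrap the data_callback(...) JSONP envelope without eval().
# Assumes the enclosed payload is valid JSON; if NetEase ships a bare JS
# literal (single quotes, trailing commas), json.loads will raise instead.
import json
import requests

def fetch_yaowen(page_suffix=""):
    url = ("http://temp.163.com/special/00804KVA/cm_yaowen"
           + page_suffix + ".js?callback=data_callback")
    text = requests.get(url, headers={"Referer": "http://news.163.com/"}).text
    # Keep only what sits between the first '(' and the last ')'.
    payload = text[text.find("(") + 1:text.rfind(")")]
    return json.loads(payload)

for item in fetch_yaowen():  # page 2 would be fetch_yaowen("_02")
    print item["title"], item["tlink"]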
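The INSERT statement assumes a wangyi table already exists in db.sqlite3; the post never shows its definition. The sketch below infers one from the seven bound columns. Declaring url (the MD5 of tlink) as the primary key is an assumption, but it matches how the try/except around the INSERT would simply report and skip articles that were already stored.

#coding:utf-8
# Assumed schema for the wangyi table, inferred from the INSERT above.
import sqlite3

cx = sqlite3.connect("db.sqlite3")  # path shortened for illustration
cx.execute("""
    CREATE TABLE IF NOT EXISTS wangyi (
        title      TEXT,
        tlink      TEXT,
        commenturl TEXT,
        tienum     TEXT,
        opentime   TEXT,
        content    TEXT,
        url        TEXT PRIMARY KEY  -- md5(tlink); duplicates fail the INSERT
    )
""")
cx.commit()
cx.close()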
