快科技的抓取

最新推荐文章于 2024-08-20 13:26:57 发布

chengjintao1121

最新推荐文章于 2024-08-20 13:26:57 发布

阅读量201

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/chengjintao1121/article/details/85330220

版权

爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

import requests
import time,json,re,pymysql
from lxml import etree
article_id_list=[608862]
# List API endpoint; the two slots take the pagination cursor (minid) and the
# current time in milliseconds.
_LIST_URL = "http://blog.mydrivers.com/news/getdatalist.aspx?minid={}&callback=NewsList&={}"
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
)
# The API answers with JSONP: NewsList({...}). The match is greedy up to the
# LAST ")" so that a ")" inside an article title does not truncate the JSON.
_NEWSLIST_JSONP = re.compile(r"NewsList\((.*)\)", re.S)


def _http_get_text(url):
    """Download ``url`` with a browser User-Agent and return the UTF-8 body."""
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers={"User-Agent": _USER_AGENT}, timeout=10)
    response.encoding = "utf8"
    return response.text


def _parse_newslist(text):
    """Strip the ``NewsList(...)`` JSONP wrapper and decode the JSON inside."""
    match = _NEWSLIST_JSONP.search(text)
    if match is None:
        raise ValueError("response is not a NewsList(...) JSONP payload")
    return json.loads(match.group(1))


def ID_last(article_id_list, fetch=None):
    """Append the next pagination cursor to ``article_id_list``.

    Requests the list page identified by the last id in the list, reads
    ``Table_minid[0]["minid"]`` from the reply and appends that value, so
    repeated calls walk through successive list pages.

    Args:
        article_id_list: Non-empty list of cursor ids. Mutated in place.
        fetch: Optional callable ``fetch(url) -> str`` returning the response
            body. Defaults to an HTTP GET through ``requests``.

    Raises:
        IndexError: ``article_id_list`` is empty or ``Table_minid`` is empty.
        ValueError: The reply is not ``NewsList(...)`` or is not valid JSON.
        KeyError: The payload lacks ``Table_minid`` / ``minid``.
    """
    if fetch is None:
        fetch = _http_get_text
    time_now = int(time.time() * 1000)
    url = _LIST_URL.format(article_id_list[-1], time_now)
    payload = _parse_newslist(fetch(url))
    article_id_list.append(payload["Table_minid"][0]["minid"])
# Number of list pages to walk through.
PAGE_COUNT = 108

# Every ID_last() call is made with the list collected so far; the loop
# index is printed after each call as a progress indicator.
for m in range(PAGE_COUNT):
    ID_last(article_id_list)
    print(m)
# --- Scrape every article behind the collected list cursors and store it ---

_LIST_API = "http://blog.mydrivers.com/news/getdatalist.aspx?minid={}&callback=NewsList&={}"
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    ),
}
_REQUEST_TIMEOUT = 10  # seconds; a stalled connection must not hang the crawl
_MISSING = "空"  # placeholder stored when a field cannot be read

# NOTE(review): credentials are hard-coded; consider moving them to config.
_DB_SETTINGS = dict(
    host="127.0.0.1",
    user="root",
    password="123456",
    database="key_word",
    charset="utf8",
)

# JSONP wrapper of the list API. Greedy up to the last ")" so that a ")"
# inside a title does not cut the JSON short.
_JSONP_BODY = re.compile(r"NewsList\((.*)\)", re.S)
_LABEL_BLOCK = re.compile(r'<div class="bqian"(.*?)</div')
_LABEL_TEXT = re.compile(r">(.*?)<")

_BODY_XPATH = (
    '//div[@class="news_info"]/p/text()'
    '|//div[@class="news_info"]/p/strong/text()'
    '|//div[@class="news_info"]/p/strong/span/text()'
)
_IMG_XPATH = '//div[@class="news_info"]/p/a/img'
_SOURCE_XPATH = "//html/body/div[6]/div[1]/div[1]/div[3]/div[1]/a[1]/text()"


def _download(url):
    """GET ``url`` with the browser User-Agent and return the UTF-8 body."""
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    return response.text


def _fetch_list_entries(min_id):
    """Return the ``dt`` array (one dict per article) of a list page."""
    url = _LIST_API.format(min_id, int(time.time() * 1000))
    match = _JSONP_BODY.search(_download(url))
    if match is None:
        raise ValueError("list response is not a NewsList(...) JSONP payload")
    return json.loads(match.group(1))["dt"]


def _entry_fields(entry):
    """Read one list entry; missing keys become the placeholder.

    Returns (title, page_url, author, created, comment_count, support_count).
    """
    try:
        page_url = "http:" + entry["Url"]
    except (KeyError, TypeError):
        page_url = _MISSING
    try:
        created = "{}-{}-{} {}:{}".format(
            entry["year"], entry["month"], entry["day"],
            entry["hour"], entry["minute"],
        )
    except KeyError:
        created = _MISSING
    return (
        entry.get("SimTitle", _MISSING),
        page_url,
        entry.get("Editor", _MISSING),
        created,
        entry.get("ReviewCount", _MISSING),
        entry.get("Support", _MISSING),
    )


def _extract_labels(html):
    """Return the non-blank text pieces inside the ``bqian`` div."""
    block = _LABEL_BLOCK.search(html)
    if block is None:
        return _MISSING
    # Build a new list instead of removing from the list being iterated,
    # which skipped the element following every removed one.
    return [piece for piece in _LABEL_TEXT.findall(block.group(1))
            if piece not in ("", " ")]


def _scrape_article(page_url):
    """Download an article page.

    Returns (source, body_text, body_length, image_count, labels); every
    value falls back to the placeholder / 0 when the page cannot be fetched
    or parsed.
    """
    try:
        html = _download(page_url)
        tree = etree.HTML(html)
    except (requests.RequestException, etree.LxmlError, ValueError) as exc:
        print("article page {} not scraped: {!r}".format(page_url, exc))
        return (_MISSING, _MISSING, 0, 0, _MISSING)
    labels = _extract_labels(html)
    print(labels)
    if tree is None:  # parser produced no document
        return (_MISSING, _MISSING, 0, 0, labels)
    body = "".join(tree.xpath(_BODY_XPATH))
    image_count = len(tree.xpath(_IMG_XPATH))
    source = tree.xpath(_SOURCE_XPATH)
    print(source)
    return (source, body, len(body), image_count, labels)


def _store_row(row, body_length):
    """Insert ``row``; bodies longer than 40 chars go to ``kuai_ke_ji_1``."""
    table = "kuai_ke_ji_1" if body_length > 40 else "kuai_ke_ji"
    placeholders = ",".join(["%s"] * len(row))
    sql = "insert into {} values(null,{})".format(table, placeholders)
    # Values are passed as parameters so the driver escapes them; scraped
    # text containing quotes used to break the hand-built statement. Each
    # value is sent as str(), matching what the formatted SQL stored before.
    params = tuple(str(value) for value in row)
    db = pymysql.connect(**_DB_SETTINGS)
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        db.commit()
    finally:
        db.close()


def _process_entry(entry):
    """Scrape and store one article; return True when a row was inserted."""
    title, page_url, author, created, comment_count, support_count = (
        _entry_fields(entry)
    )
    print(created, title, page_url, author, comment_count, support_count)
    source, body, body_length, image_count, labels = _scrape_article(page_url)
    row = (title, source, created, page_url, body, body_length,
           image_count, labels, support_count, comment_count)
    try:
        _store_row(row, body_length)
    except pymysql.MySQLError as exc:
        print("insert failed for {}: {!r}".format(page_url, exc))
        return False
    return True


tt = 0  # rows inserted so far
ss = 0  # list pages processed so far
for list_id in article_id_list:
    # Boundary handlers: one bad page or article must not stop the crawl.
    # ``except Exception`` (not a bare except) keeps Ctrl-C working.
    try:
        entries = _fetch_list_entries(list_id)
    except Exception as exc:
        print("list page {} skipped: {!r}".format(list_id, exc))
        entries = []
    for entry in entries:
        try:
            stored = _process_entry(entry)
        except Exception as exc:
            print("article skipped: {!r}".format(exc))
            continue
        if stored:
            print(tt)
            tt += 1
    ss += 1
    print(ss)