爬取爱问知识人,问题及回答

主要源码:
aiwen_spider.py:
import scrapy
from aiwen.items import AiwenItem

class aiwenSpider(scrapy.Spider):
    """Crawl iask.sina.com.cn "good answer" listing pages and extract
    each question's title and answer text as AiwenItem objects."""

    name = "aiwen"
    # Must be a list of bare domain names (no scheme, no slashes); the
    # original value "/iask.sina.com" would never match the target host.
    allowed_domains = ["iask.sina.com.cn"]
    start_urls = [
        "https://iask.sina.com.cn/c/80-goodAnswer-1-new.html",
    ]
    # Spoof a desktop browser so the site serves normal pages.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"
    )

    def parse(self, response):
        """Follow every question link on a listing page, then follow
        pagination links back into parse()."""
        headers = {"User-Agent": self.user_agent}
        question_links = response.xpath(
            '//div[@class="list-body-con current"]/ul/li/div'
            '/div[@class="question-title"]/a/@href'
        ).extract()
        for href in question_links:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.content,
                dont_filter=True,
                headers=headers,
            )
        # Pagination links ("next page" etc.) — recurse into parse().
        page_links = response.xpath('//div[@class="page mt30"]/a/@href').extract()
        for href in page_links:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse,
                dont_filter=True,
                headers=headers,
            )

    def content(self, response):
        """Extract the question title and all answers from a detail page.

        The original code overwrote item['question'] / item['answer'] in
        separate loops and yielded only the last pair; yield one item per
        answer instead so no answer is lost.
        """
        questions = response.xpath('//p[@class="title-text"]/text()').extract()
        answers = response.xpath(
            '//div[@class="new-answer-text new-answer-cut new-pre-answer-text"]'
            '/pre/text()'
        ).extract()
        question = questions[0] if questions else ""
        for answer in answers:
            item = AiwenItem()
            item["question"] = question
            item["answer"] = answer
            yield item

item.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class AiwenItem(scrapy.Item):
    """Container for one scraped question/answer pair."""

    # The question's title text.
    question = scrapy.Field()
    # One answer's body text.
    answer = scrapy.Field()

main.py
# coding=utf-8
from scrapy import cmdline

# Convenience entry point: equivalent to running "scrapy crawl aiwen"
# from the project directory. Guarded so importing this module is safe.
if __name__ == "__main__":
    cmdline.execute("scrapy crawl aiwen".split())
pipeline.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
class AiwenPipeline(object):
    """Item pipeline that stores each scraped answer in the MySQL
    table `aiwen` (database `test` on localhost)."""

    # Bug fix: the original defined `init`, which Python never calls as a
    # constructor — the connection was never opened. Must be `__init__`.
    def __init__(self):
        # NOTE(review): credentials are hard-coded; they belong in
        # settings.py / environment variables.
        self.conn = pymysql.connect(
            host="localhost",
            user="root",
            password="123",
            db="test",
            charset="utf8",
        )
        # Recreate the table on every run — previous results are discarded.
        with self.conn.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS aiwen")
            cursor.execute("CREATE TABLE aiwen(aiwen text(1000))")
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert the answer text and pass the item through.

        Uses a parameterized query instead of %-interpolating
        pymysql.escape_string() into the SQL string, which was both
        deprecated and injection-prone.
        """
        with self.conn.cursor() as cursor:
            cursor.execute(
                "INSERT INTO aiwen(aiwen) VALUES (%s)",
                (item["answer"],),
            )
        self.conn.commit()
        return item

学习总结:
1.在这个小任务中,巩固了scrapy框架的使用,同时也掌握了xpath的使用

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值