Preface
This post walks through a small Scrapy project that crawls the Maoyan Top 100 movie board (https://maoyan.com/board/4). The spider extracts each film's title, starring cast, and release date, and two item pipelines save the results to a CSV file and to a MySQL table.
I. Requirements
The project needs Scrapy for crawling and pymysql for the MySQL pipeline (both installable with pip install scrapy pymysql); the csv and urllib modules used below ship with the Python standard library.
II. Steps
1. Import the libraries
The code is as follows (example). These are the libraries the project files in steps 2-4 rely on; the files themselves live in a standard Scrapy project skeleton created with scrapy startproject:
import scrapy        # the Scrapy crawling framework
import pymysql       # MySQL client driver used in pipelines.py
import csv           # standard-library CSV writer used in pipelines.py
import urllib.parse  # standard-library helper for joining "next page" URLs
2. maoyanspider.py
# -*- coding: utf-8 -*-
import scrapy
import urllib.parse
from ..items import MaoyanItem


class MaoyanspiderSpider(scrapy.Spider):
    name = 'maoyanspider'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/board/4']

    def parse(self, response):
        # Each movie on the board is a <dd> inside <dl class="board-wrapper">.
        dds = response.xpath("//dl[@class='board-wrapper']/dd")
        for dd in dds:
            item = MaoyanItem()
            item['name'] = dd.xpath(
                "div[@class='board-item-main']/div[@class='board-item-content']"
                "/div[@class='movie-item-info']/p[@class='name']/a/text()"
            ).extract_first()
            item['actors'] = dd.xpath(
                "div[@class='board-item-main']/div[@class='board-item-content']"
                "/div[@class='movie-item-info']/p[@class='star']/text()"
            ).extract_first().strip()
            item['releasetime'] = dd.xpath(
                "div[@class='board-item-main']/div[@class='board-item-content']"
                "/div[@class='movie-item-info']/p[@class='releasetime']/text()"
            ).extract_first()
            yield item

        # Follow the "下一页" (next page) link until the last page.
        next_page = response.xpath(
            '//div[@class="pager-main"]/ul/li/a[contains(text(), "下一页")]/@href'
        ).extract_first()
        if next_page is not None:
            new_link = urllib.parse.urljoin(response.url, next_page)
            yield scrapy.Request(new_link, callback=self.parse)
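To test the spider quickly, you can drive it from a short script instead of the scrapy crawl maoyanspider command. A minimal sketch, assuming it is run from the project root so that Scrapy can find the project's settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings and run the spider registered as 'maoyanspider'.
process = CrawlerProcess(get_project_settings())
process.crawl('maoyanspider')
process.start()  # blocks until the crawl finishes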
3. items.py
import scrapy


class MaoyanItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    actors = scrapy.Field()
    releasetime = scrapy.Field()
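For intuition, a scrapy.Item behaves like a dict whose keys are restricted to the declared fields. The values below are made-up placeholders, not real scraped data:

item = MaoyanItem()
item['name'] = 'Some Movie'          # hypothetical values for illustration
item['actors'] = 'Actor A, Actor B'
item['releasetime'] = '2020-01-01'
print(dict(item))                    # items convert cleanly to plain dicts
# item['rating'] = 9.0               # would raise KeyError: not a declared field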
4. pipelines.py
import csv

import pymysql


class MaoyanPipeline(object):
    # Appends every scraped item to a local CSV file.
    def process_item(self, item, spider):
        data_list = [item['name'], item['actors'], item['releasetime']]
        head = ('name', 'actors', 'releasetime')
        with open('maoyan.csv', 'a+', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            # writer.writerow(head)  # uncomment once to write the header row
            writer.writerow(data_list)
        return item


class MaoyanMysqlPipeline(object):
    # Inserts every scraped item into the `maoyan` table in MySQL.
    def open_spider(self, spider):
        print('Spider started')
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='test',
                                  port=3306, charset='utf8')
        # cursor object used to execute SQL statements
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        t = (item['name'], item['actors'], item['releasetime'])
        sql = 'insert into maoyan values (%s, %s, %s)'
        self.cursor.execute(sql, t)
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
        print('Spider finished')
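Two setup details are easy to miss. First, MaoyanMysqlPipeline assumes a maoyan table already exists in the test database; below is a one-off sketch that creates it (the VARCHAR column types and sizes are assumptions, chosen generously to fit the scraped strings):

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='test', port=3306, charset='utf8')
cursor = db.cursor()
# Three columns, in the same order the pipeline's INSERT supplies them.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS maoyan (
        name VARCHAR(255),
        actors VARCHAR(255),
        releasetime VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()

Second, neither pipeline runs until it is registered in the project's settings.py under ITEM_PIPELINES, e.g. {'<project>.pipelines.MaoyanPipeline': 300, '<project>.pipelines.MaoyanMysqlPipeline': 400}, where <project> is the package name created by scrapy startproject.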