Python 电影天堂爬虫

#!/usr/bin/python
# -*- coding:utf-8 -*-

import os

import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree

# MySQL connection shared by the whole crawler.
# Host, user, password and database name can each be overridden through an
# environment variable so credentials do not have to live in the source; the
# fallbacks are the original local-development values, so running the script
# without any of these variables set behaves exactly as before.
connection = pymysql.connect(host=os.environ.get('MOVIE_DB_HOST', 'localhost'),
                             user=os.environ.get('MOVIE_DB_USER', 'root'),
                             password=os.environ.get('MOVIE_DB_PASSWORD', '0825'),
                             db=os.environ.get('MOVIE_DB_NAME', 'test'),
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)


# Common request helper (HTTP GET)
def get_request(url, save='', retries=3, timeout=10):
    """Fetch *url* with a random User-Agent and return the parsed HTML tree.

    Args:
        url: Address of the page to download.
        save: Optional file path. When non-empty, the decoded page text is
            also written to this file as UTF-8.
        retries: Maximum number of attempts made when the request itself
            fails (values below 1 are treated as 1).
        timeout: Seconds to wait for the server before an attempt is
            considered failed.

    Returns:
        The result of ``etree.HTML`` on the response body decoded as GBK.

    Raises:
        requests.RequestException: If every attempt fails; the error of the
            last attempt is propagated.
    """
    # The HTTP header name is "User-Agent"; any other spelling is sent as an
    # unrelated custom header and the default agent would still be used.
    header = {"User-Agent": UserAgent().random}

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            r = requests.get(url, headers=header, timeout=timeout)
        except requests.RequestException:
            # Bounded retry: give up and surface the error on the last attempt
            # instead of recursing without limit.
            if attempt == attempts:
                raise
            continue
        break

    # Force GBK decoding *before* r.text is first read, so that both the saved
    # copy and the parsed tree use the same, correct text.
    r.encoding = 'gbk'
    text = r.text
    if save != '':
        with open(save, "w", encoding="utf-8") as f:
            f.write(text)
    return etree.HTML(text)


# Spider
def spider(url):
    """Crawl one listing page and store every movie it links to.

    The listing page at *url* is downloaded (a copy is saved to
    ``index.html``), the detail-page links are extracted, and each detail
    page is scraped for title, two images, intro text and the first download
    link. A row is inserted into the ``movie`` table only when no row with
    the same title exists yet.

    Detail pages that cannot be parsed or that have no title are skipped so
    that one malformed page does not abort the whole crawl. A missing
    download link is stored as an empty string.

    Args:
        url: Address of the listing page.
    """
    index = get_request(url, "index.html")
    if index is None:
        # Nothing could be parsed from the listing page.
        return

    href_list = index.xpath("//div[@class='co_content8']/ul/td/table/tr[2]/td[2]/b/a[2]/@href")
    detail_urls = ["http://www.ygdy8.net/" + href for href in href_list]

    for detail_url in detail_urls:
        html = get_request(detail_url)
        if html is None:
            continue

        title = html.xpath("//*[@class='title_all']/h1/font/text()")
        if not title:
            # Without a title there is no key for the duplicate check.
            continue

        images = html.xpath("//*[@border='0']/@src")
        # The second and third matching images are used; the first is ignored.
        cover = images[1] if len(images) > 1 else ""
        img = images[2] if len(images) > 2 else ""

        intro = html.xpath("//*[@id='Zoom']/td[1]/p[1]/text()")
        link = html.xpath("//*[@style='WORD-WRAP: break-word']/a/@href")
        first_link = link[0] if link else ""

        print(title[0])
        with connection.cursor() as cursor:
            sql = "SELECT * FROM `movie` where `title` = %s"
            # Parameters are passed as a real one-element tuple.
            have = cursor.execute(sql, (title[0],))
            if have == 0:
                sql = "INSERT INTO `movie` (`title`, `cover`, `img`, `intro`, `link`) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(sql, (title[0], cover, img, "".join(intro), first_link))
                connection.commit()


# Domestic movies: walk listing pages 1 through 203 and crawl each one.
for i in range(1, 204):
    url = "http://www.ygdy8.net/html/gndy/china/list_4_%d.html" % i
    print(url)
    spider(url)

 

数据库结构:

-- Schema of the `movie` table that the crawler's spider() function reads
-- from (duplicate check by title) and inserts into.
-- WARNING: the DROP below removes any existing `movie` table and its rows.
DROP TABLE IF EXISTS `movie`;
-- Columns written by the crawler: title, cover, img, intro, link.
-- `id` is generated by the database.
-- NOTE(review): AUTO_INCREMENT = 359 looks like a leftover from an exported
-- dump; a freshly created table will start its ids at 359 - confirm whether
-- that is intended.
CREATE TABLE `movie`  (
  `id` int(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `cover` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `img` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `intro` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
  `link` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 359 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

-- NOTE(review): no matching `SET FOREIGN_KEY_CHECKS = 0` is visible in this
-- excerpt; presumably it was in a part of the dump that is not shown.
SET FOREIGN_KEY_CHECKS = 1;

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值