#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests
import pymysql
from lxml import etree
from fake_useragent import UserAgent
# Mysql Connect
# Settings for the MySQL database that receives the scraped movie rows.
_db_settings = {
    'host': 'localhost',
    'user': 'root',
    'password': '0825',
    'db': 'test',
    'charset': 'utf8mb4',
    # DictCursor makes fetched rows come back as dicts keyed by column name.
    'cursorclass': pymysql.cursors.DictCursor,
}
# Module-level connection shared by the functions below.
connection = pymysql.connect(**_db_settings)
# Common Request(Get)
def get_request(url, save='', retries=3, timeout=10):
    """Download *url* and return it parsed as an lxml HTML tree.

    The response body is decoded as GBK before it is saved or parsed.

    Args:
        url: Address of the page to fetch.
        save: Optional file path. When non-empty, the decoded page text is
            written to this path as UTF-8.
        retries: How many extra attempts to make after a failed request.
        timeout: Seconds to wait on the server for each attempt.

    Returns:
        The result of ``etree.HTML`` applied to the decoded page text.

    Raises:
        requests.RequestException: If every attempt to fetch the page fails.
    """
    # The HTTP header name is "User-Agent"; any other spelling is sent as an
    # unrelated header and the random agent is never applied.
    header = {"User-Agent": UserAgent().random}

    last_error = None
    # Bounded retry loop: one initial attempt plus `retries` further ones.
    for _ in range(max(retries, 0) + 1):
        try:
            r = requests.get(url, headers=header, timeout=timeout)
        except requests.RequestException as err:
            last_error = err
            continue
        break
    else:
        # Every attempt failed; surface the last network error to the caller.
        raise last_error

    # Set the encoding before the first access to r.text so that both the
    # saved copy and the parsed tree are decoded the same way.
    r.encoding = 'gbk'
    text = r.text
    if save != '':
        with open(save, "w", encoding="utf-8") as f:
            f.write(text)
    return etree.HTML(text)
# Spider
def spider(url):
    """Scrape one listing page and store each movie it links to.

    The listing page at *url* is fetched (and saved to ``index.html``), the
    detail-page links are extracted from it, and every detail page is fetched
    in turn. For each detail page the title, cover image, second image,
    introduction text and download link are read via XPath and inserted into
    the ``movie`` table, unless a row with the same title already exists.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. Results are written to the database through the module-level
        ``connection``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    index = get_request(url, "index.html")
    if index is None:
        # Nothing to scrape when the listing page produced no tree.
        return

    hrefs = index.xpath("//div[@class='co_content8']/ul/td/table/tr[2]/td[2]/b/a[2]/@href")
    # urljoin copes with root-relative ("/html/..."), relative and absolute
    # hrefs alike, where plain concatenation would yield "//" or a broken URL.
    detail_urls = [urljoin("http://www.ygdy8.net/", href) for href in hrefs]

    for detail_url in detail_urls:
        html = get_request(detail_url)
        if html is None:
            continue

        title = html.xpath("//*[@class='title_all']/h1/font/text()")
        if not title:
            # The title is the de-duplication key; a page without one cannot
            # be stored, so skip it instead of aborting the whole crawl.
            continue

        images = html.xpath("//*[@border='0']/@src")
        # The second and third matched images are used; fall back to "" when
        # the page has fewer images.
        cover = images[1] if len(images) > 1 else ""
        img = images[2] if len(images) > 2 else ""
        intro = html.xpath("//*[@id='Zoom']/td[1]/p[1]/text()")
        link = html.xpath("//*[@style='WORD-WRAP: break-word']/a/@href")
        # Same "" fallback as cover/img when no download link is present.
        first_link = link[0] if link else ""

        print(title[0])
        with connection.cursor() as cursor:
            sql = "SELECT * FROM `movie` where `title` = %s"
            # Query parameters are passed as a real one-element tuple.
            have = cursor.execute(sql, (title[0],))
            if have == 0:
                sql = "INSERT INTO `movie` (`title`, `cover`, `img`, `intro`, `link`) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(sql, (title[0], cover, img, "".join(intro), first_link))
                connection.commit()
# Domestic movies: walk listing pages 1 through 203 and scrape each one.
for page_no in range(1, 204):
    url = "http://www.ygdy8.net/html/gndy/china/list_4_%d.html" % (page_no,)
    print(url)
    spider(url)
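# Database schema (数据库结构): MySQL DDL for the `movie` table; run it in MySQL, it is not part of the Python script.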
-- Schema for the `movie` table. Its columns (`title`, `cover`, `img`,
-- `intro`, `link`) match the INSERT statement issued by spider() in the
-- Python script in this file.
-- WARNING: this drops any existing `movie` table before recreating it.
DROP TABLE IF EXISTS `movie`;
CREATE TABLE `movie` (
-- Auto-assigned numeric primary key.
`id` int(10) UNSIGNED NOT NULL AUTO_INCREMENT,
-- Movie title; spider() looks rows up by this column before inserting.
`title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
-- Image URLs; spider() stores '' when the page has too few images.
`cover` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
`img` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
-- Introduction text; the only column that allows NULL.
`intro` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
-- Download link taken from the detail page.
`link` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
PRIMARY KEY (`id`) USING BTREE
-- NOTE(review): AUTO_INCREMENT = 359 looks like a leftover from an exported
-- dump; confirm whether a fresh install should start numbering here.
) ENGINE = InnoDB AUTO_INCREMENT = 359 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
-- NOTE(review): no matching `SET FOREIGN_KEY_CHECKS = 0` is visible in this
-- excerpt; presumably the dump header was cut off. Verify before running.
SET FOREIGN_KEY_CHECKS = 1;