1. Example code:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#@Time : 2020/4/7 16:28
#@Author : zhangliangliang
#@File : crawlerDemo3.py
#@Software: PyCharm
from urllib import request
from lxml import etree
import random
import requests,threading,datetime
from bs4 import BeautifulSoup
BASE_URL = "http://www.dytt8.net"
def readFile(path):
    """Read the text file at *path* and return its lines as a list.

    Trailing whitespace (including the newline) is stripped from each line.

    :param path: path to a readable text file
    :return: list[str] of stripped lines, in file order (empty list for an empty file)
    :raises OSError: if the file cannot be opened
    """
    # Encoding is pinned to utf-8 for consistency with truncateFile and to
    # avoid depending on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip() for line in f]
def writeFile(path, text):
    """Append *text* followed by a newline to the file at *path*.

    The file is created if it does not exist; existing content is preserved
    (mode 'a').

    :param path: path of the file to append to
    :param text: line of text to append (newline is added here)
    :raises OSError: if the file cannot be opened for writing
    """
    # Encoding pinned to utf-8 for consistency with truncateFile/readFile.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
def truncateFile(path):
    """Empty the file at *path*, creating it if it does not exist.

    :param path: path of the file to empty
    :raises OSError: if the file cannot be opened for writing
    """
    # Opening in 'w' mode already truncates the file; the explicit
    # truncate() call is kept from the original but is a no-op here.
    with open(path, 'w', encoding='utf-8') as f:
        f.truncate()
def getHeaders(path="/Users/zll/pycharmProjects/studyPython/crawler_poject_base_part1/config/user_agent.txt"):
    """Build an HTTP request-header dict with a randomly chosen User-Agent.

    The pool of User-Agent strings is read (one per line) from *path*.
    Generalized: the previously hard-coded config path is now a defaulted
    parameter, so existing callers are unaffected.

    :param path: text file containing one User-Agent string per line
    :return: dict with a single 'User-Agent' key
    :raises OSError: if the user-agent file cannot be read
    :raises IndexError: if the file is empty (random.choice on empty list)
    """
    user_agent_list = readFile(path)
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    print(headers)  # debug trace kept from the original
    return headers
def getIp(path='/Users/zll/pycharmProjects/studyPython/crawler_poject_base_part1/config/ip.txt'):
    """Pick one proxy IP ("host:port" string) at random from a config file.

    Generalized: the previously hard-coded config path is now a defaulted
    parameter, so existing callers are unaffected.

    :param path: text file containing one proxy address per line
    :return: one randomly selected line from the file
    :raises OSError: if the ip file cannot be read
    :raises IndexError: if the file is empty (random.choice on empty list)
    """
    ip_list = readFile(path)
    ip = random.choice(ip_list)
    print(ip)  # debug trace kept from the original
    return ip
def checkip(targeturl, ip):
    """Return True if *targeturl* answers HTTP 200 through proxy *ip*.

    :param targeturl: URL used to probe the proxy
    :param ip: proxy address as "host:port" (scheme prefixes are added here)
    :return: True on status 200 within a 5 s timeout, False otherwise
    """
    headers = getHeaders()  # randomized request headers
    proxies = {"http": "http://" + ip, "https": "https://" + ip}
    try:
        status = requests.get(url=targeturl, proxies=proxies,
                              headers=headers, timeout=5).status_code
    except requests.RequestException:
        # Narrowed from a bare `except:`: only network/timeout/proxy errors
        # mean "bad proxy"; programming errors now surface instead of being
        # silently swallowed (a bare except even caught KeyboardInterrupt).
        return False
    return status == 200
def getProxies(url):
    """Return a requests-style proxies dict for a random working proxy.

    Picks a random IP via getIp() and validates it against *url* with
    checkip(). Note the returned mapping only covers the 'http' scheme,
    matching the original behavior.

    :param url: URL used to validate the candidate proxy
    :return: {'http': 'http://<ip>'} on success, or None when the candidate
             proxy fails validation (explicit None replaces the bare
             `return`; requests treats proxies=None as "no proxy")
    """
    ip = getIp()
    if not checkip(url, ip):  # idiomatic truth test instead of `is True`
        return None
    proxies = {'http': 'http://' + ip}
    print(proxies)  # debug trace kept from the original
    return proxies
#def getProxies(url):
def get_detail_url(url):
proxies = getProxies(url)
header = getHeaders()
try:
response = requests.get(url, headers=header,proxies=proxies)
#print(response.content.decode('gbk'))