# 修改 (changelog):
# (1) 修改了结果中存在乱码的问题 (fixed mojibake/garbled characters in the scraped results);
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import time
import csv
def getHtml(url):
    """Scrape one list page of dytt8.net and save every movie's ftp links.

    For each movie linked from the list page *url*, fetch its detail page,
    extract the download (ftp) links, print each record, and append it to
    dy.csv via save_result().

    :param url: absolute URL of a dytt8.net list page
                (e.g. .../html/gndy/dyzz/list_23_1.html)
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    page = requests.get(url, headers=headers, timeout=10)
    # The site declares gb2312, but some pages contain characters outside it.
    # gb18030 is a strict superset of gb2312/gbk, so decoding with it fixes
    # the mojibake without breaking anything. Must be set before reading .text.
    page.encoding = "gb18030"
    soup = BeautifulSoup(page.text, 'html.parser')
    for section in soup.find_all('div', class_="co_content8"):
        for url_info in section.find_all('a', class_="ulink"):
            moviename = url_info.get_text()
            url_1 = 'http://www.dytt8.net' + url_info['href']
            # Detail page: server reports ISO-8859-1 but content is Chinese,
            # so force the encoding here as well (see note above).
            req2 = requests.get(url_1, headers=headers, timeout=10)
            req2.encoding = "gb18030"
            # Separate name so we don't shadow the list-page soup.
            detail_soup = BeautifulSoup(req2.text, 'html.parser')
            for td in detail_soup.find_all('td', attrs={'style': 'WORD-WRAP: break-word'}):
                for link in td.find_all('a'):
                    # NOTE: the original `url_3.encoding = 'gbk'` line was a
                    # no-op (assigning an attribute on a string re-encodes
                    # nothing); correct decoding is handled by req2.encoding.
                    item = {  # one scraped record
                        "moviename": moviename,
                        "movielink": url_1,
                        "ftplink": link.string,
                    }
                    print(item)
                    save_result(item)  # persist each record as soon as it is found
#存储
def save_result(item):
    """Append one scraped record to dy.csv.

    Writes the header row exactly once — only when the file is new or empty —
    then appends *item* as a data row. The original version never wrote a
    header (that code path was commented out).

    :param item: dict with keys 'moviename', 'movielink', 'ftplink'
    """
    with open('dy.csv', 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['moviename', 'movielink', 'ftplink']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # In append mode tell() is the current file size: 0 means a fresh
        # file, so emit the header before the first data row.
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerow(item)
def main():
    """Crawl the first three 'dyzz' list pages, pausing between requests."""
    for page_no in range(1, 4):
        print(f'正在访问第{page_no}页')
        page_url = f'https://www.dytt8.net/html/gndy/dyzz/list_23_{page_no}.html'
        getHtml(page_url)
        # Be polite to the server between page fetches.
        time.sleep(3)


if __name__ == '__main__':
    main()