# 自己编写的练习爬虫代码,希望对大家思路有帮助,技术欠佳,正在练习。加油!!
# -*- coding: utf-8 -*-
# @Time : 2021/8/10 16:56
# @Author : yz
# @File : 论坛2.py
# @Software : PyCharm
# 引入request库, requests是python实现的简单易用的HTTP库,使用起来比urllib简洁很多
import requests
from lxml import etree
import re
def main():
    """Entry point: write the CSV header, then crawl every thread id."""
    base_url = "http://114.112.74.132:8089/forum.php?mod=viewthread&tid="
    head()
    askurl(base_url)
# Pre-compiled regex that captures the numeric uid from a profile href
# such as "...&uid=123" (group 1 is the digits after "uid=").
findUid = re.compile(r'uid=(\d+)')
# Upper bound on thread ids: the crawler walks tids 0 .. max_tid-1.
max_tid = 5846
########判断url是否合法,是否可连通,HTTP状态码是否为200
def get_url_content(url):
    """Fetch *url* and return its parsed lxml tree, or False when unusable.

    Returns False when the request raises, the HTTP status is not 200, or
    the page is the forum's "topic missing / deleted / under review"
    placeholder; otherwise returns the ``etree.HTML`` document.
    """
    try:
        # A timeout keeps one dead thread id from hanging the entire crawl;
        # connection errors on a single tid should not abort the run.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return False
    if response.status_code != 200:
        return False
    # The forum answers 200 even for missing topics, so detect the
    # placeholder text in the body.
    if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text:
        return False
    return etree.HTML(response.text)
# 定义方法解析html_text
def pare_post_data(html_text, tid):
    """Extract thread metadata from a parsed topic page.

    Parameters:
        html_text: lxml document for forum.php?mod=viewthread&tid=<tid>.
        tid: thread id as a string; it also keys the first poster's
             avatar element id ("favatar<tid>").

    Returns a dict with string values under "tid", "title" and "uid".
    """
    title = html_text.xpath('//*[@id="thread_subject"]')[0].text
    # The first poster's profile link sits inside the avatar block.
    href = str(html_text.xpath('//*[@id="favatar%s"]/div[1]/div/a/@href' % tid)[0])
    # Use the module-level findUid regex, which anchors on "uid=" and so
    # cannot grab an unrelated number in the href; fall back to the old
    # first-number scan if the link format ever changes.
    match = findUid.search(href)
    uid = match.group(1) if match else (re.findall(r"\d+\.?\d*", href))[0]
    post_content_info = {
        "tid": tid,
        "title": title,
        "uid": uid,
    }
    return post_content_info
def head():
    """Create/truncate the output file and write the CSV header row.

    Uses a ``with`` block so the handle is closed even if the write fails;
    mode "w" discards any previous run's results.
    """
    with open(".\\luntan.txt", "w", encoding='utf-8') as f:
        f.write("tid" + "," + "title" + "," + "uid\n")
#爬取所有网页并处理数据
def askurl(url):
    """Crawl thread ids 0 .. max_tid-1 and append each hit to luntan.txt.

    For every tid, fetches ``url + tid``; ids whose page is missing or
    deleted (get_url_content returns False) are skipped. Each hit is
    printed and written as a "tid,title,uid" CSV row.
    """
    # Open the output once, instead of re-opening it for every thread id.
    with open(".\\luntan.txt", "a+", encoding='utf-8') as f:
        for i in range(max_tid):
            html = get_url_content(url + str(i))
            # Compare against the False sentinel explicitly: an lxml tree
            # with no children is itself falsy, so a plain truth test
            # could wrongly skip a valid page.
            if html is False:
                continue
            tid = str(i)
            info = pare_post_data(html, tid)
            row = info.get("tid") + ',' + info.get("title") + ',' + info.get("uid")
            print(row)
            f.write(row + '\n')
if __name__ == "__main__":
    # Echo the CSV header to the console, run the crawl, then report done.
    header = ",".join(("tid", "title", "uid"))
    print(header)
    main()
    print("爬取完毕!!!")