A Go crawler that scrapes the Douban Movie Top 250 and writes it to Excel

package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
	"strconv"
	"time"

	"github.com/tealeg/xlsx"
)

// Spider bundles the target URL with the request headers used for one fetch.
type Spider struct {
	url    string
	header map[string]string
}

// Films holds one scraped entry: the page it came from, the title,
// the rating, and the number of ratings.
type Films struct {
	rows           string
	name           string
	scores         string
	scores_pepoles string
}

// get_html_header issues a GET request with the configured headers
// and returns the response body as a string.
func (keyword Spider) get_html_header() string {
	client := &http.Client{}
	req, err := http.NewRequest("GET", keyword.url, nil)
	if err != nil {
		log.Fatal(err)
	}
	for key, value := range keyword.header {
		req.Header.Add(key, value)
	}
	resp, err := client.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	return string(body)
}
func parse() {
	// Create the workbook and write the header row.
	file := xlsx.NewFile()
	sheet, err := file.AddSheet("sheet1")
	if err != nil {
		log.Fatal(err)
	}
	row := sheet.AddRow()
	cell := row.AddCell()
	cell.Value = "Page"
	cell = row.AddCell()
	cell.Value = "Title"
	cell = row.AddCell()
	cell.Value = "Rating"
	cell = row.AddCell()
	cell.Value = "Number of ratings"

	// Scrape the films and append one row per film.
	films := getFilms()
	for _, film := range films {
		row := sheet.AddRow()
		rowsCell := row.AddCell()
		rowsCell.Value = film.rows

		nameCell := row.AddCell()
		nameCell.Value = film.name

		scoresCell := row.AddCell()
		scoresCell.Value = film.scores

		scores_pepolesCell := row.AddCell()
		scores_pepolesCell.Value = film.scores_pepoles
	}
	err = file.Save("C:/file.xlsx")
	if err != nil {
		fmt.Println(err.Error())
	}
}

func getFilms() []Films {
	films := make([]Films, 0)
	header := map[string]string{
		"Host":                      "movie.douban.com",
		"Connection":                "keep-alive",
		"Cache-Control":             "max-age=0",
		"Upgrade-Insecure-Requests": "1",
		"User-Agent":                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
		"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
		"Referer":                   "https://movie.douban.com/top250",
	}
	// The Top 250 list is split across 10 pages of 25 films each.
	for i := 0; i < 10; i++ {
		fmt.Println("Fetching page " + strconv.Itoa(i+1) + "......")
		url := "https://movie.douban.com/top250?start=" + strconv.Itoa(i*25) + "&filter="
		spider := &Spider{url, header}
		html := spider.get_html_header()

		// Number of ratings, e.g. "<span>123456人评价</span>"
		pattern2 := `<span>(.*?)人评价</span>`
		rp2 := regexp.MustCompile(pattern2)
		find_txt2 := rp2.FindAllStringSubmatch(html, -1)

		// Rating, e.g. `property="v:average">9.2</span>`
		pattern3 := `property="v:average">(.*?)</span>`
		rp3 := regexp.MustCompile(pattern3)
		find_txt3 := rp3.FindAllStringSubmatch(html, -1)

		// Title; the character class skips the "&nbsp;/ ..." alternate-title spans.
		pattern4 := `"title">([^&nbsp].*?)</span>`
		rp4 := regexp.MustCompile(pattern4)
		find_txt4 := rp4.FindAllStringSubmatch(html, -1)

		// The three result slices are index-aligned: one entry per film on the page.
		for j := 0; j < len(find_txt2); j++ {
			film := Films{}
			film.rows = strconv.Itoa(i + 1)
			film.name = find_txt4[j][1]
			film.scores = find_txt3[j][1]
			film.scores_pepoles = find_txt2[j][1]
			films = append(films, film)
		}
	}
	return films
}
func main() {
	// Time the whole scrape so the total duration can be reported at the end.
	t1 := time.Now()
	parse()
	elapsed := time.Since(t1)

	fmt.Println("Scrape finished, total time: ", elapsed)
}
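
To sanity-check the result, the workbook can be read back with the same tealeg/xlsx package. The snippet below is a minimal, separate verification program, not part of the scraper; it assumes the output was saved to C:/file.xlsx as in parse() above and simply prints every cell's Value field.

// verify.go - a minimal sketch that reads the generated workbook back
// and prints its contents; the path matches the one used in parse().
package main

import (
	"fmt"
	"log"

	"github.com/tealeg/xlsx"
)

func main() {
	file, err := xlsx.OpenFile("C:/file.xlsx") // same path the scraper saved to
	if err != nil {
		log.Fatal(err)
	}
	for _, sheet := range file.Sheets {
		for _, row := range sheet.Rows {
			for _, cell := range row.Cells {
				// Cell.Value holds the raw string the scraper wrote.
				fmt.Print(cell.Value, "\t")
			}
			fmt.Println()
		}
	}
}

Run after the scraper has finished; it should print the header row followed by the scraped film rows.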

Reposted from: https://www.cnblogs.com/niulanshandeniu/p/11277380.html
