The original approach:
1. Fetch the remote resource and get the total record count.
2. Then, based on the total page count, collect the remaining pages with goroutines and save the data.
The code was as follows:
func Testxx(e echo.Context) error {
	var req servers.ResourceReq
	if err := utils.BindAndValidate(e, &req); err != nil {
		return err
	}
	// Load the collection configuration
	req.PageAndLimit()
	if err := req.UpdateSuperResource(e); err != nil {
		return err
	}
	if req.Limit == 0 {
		req.Limit = 30
	}
	var cfg models.CollectionConfig
	if err := database.GetGormDb().First(&cfg).Error; err != nil {
		return err
	}
	if _, err := req.CheckCategoryBind(); err != nil {
		return err
	}
	req.Action = "videolist"
	var (
		data *servers.CelebrityDataJson
		err  error
	)
	data, err = servers.GetCelebrityDataFromRemoteResource(req)
	if err != nil {
		return err
	}
	// Handle the collected data
	if data == nil || data.Total == 0 {
		return errors.New("no data available to collect")
	}
	go testSave(data.List, 1)
	for i := int64(2); i <= data.PageCount; i++ {
		// Collect the remaining pages; req and i are passed as arguments,
		// so each goroutine works on its own copy
		go func(rq servers.ResourceReq, pg int64) {
			defer func() {
				if err := recover(); err != nil {
					fmt.Println(err)
				}
			}()
			rq.Page = pg
			var (
				vodList *servers.CelebrityDataJson
				err     error
			)
			vodList, err = servers.GetCelebrityDataFromRemoteResource(rq)
			if err != nil {
				return
			}
			fmt.Printf("result: %v", vodList)
			testSave(vodList.List, pg)
		}(req, i)
	}
	return e.JSON(http.StatusOK, servers.TakeVodResp{
		Message: "submitted successfully",
		LogId:   1,
	})
}
// testSave is a stand-in for the real save step: it just prints each record.
func testSave(celebrites []servers.CelebrityColumn, i int64) {
	for _, cel := range celebrites {
		fmt.Printf("slice length: %d", len(celebrites))
		fmt.Printf("page %d: %s\n", i, cel.Name)
	}
}
func GetCelebrityDataFromRemoteResource(req ResourceReq) (*CelebrityDataJson, error) {
	switch req.Op {
	case "day":
		req.Hours = 24
	case "week":
		req.Hours = cast.ToInt64(getWeekIndex() * 24)
	case "month":
		req.Hours = 30 * 24
	}
	db := database.GetGormDb()
	// Look up the resource record
	var resource models.Collection
	if err := db.Where("id=? and data_type=?", req.ResourceID, CollectPerson).First(&resource).Error; errors.Is(err, gorm.ErrRecordNotFound) {
		return nil, errors.New("celebrity resource library does not exist")
	} else if err != nil {
		return nil, err
	}
	// Build the resource URL
	if resource.Address == "" {
		return nil, errors.New("invalid resource address")
	}
	if !strings.HasPrefix(resource.Address, "https://") && !strings.HasPrefix(resource.Address, "http://") {
		resource.Address = fmt.Sprintf("http://%s", resource.Address)
	}
	val, _ := query.Values(req)
	requestUrl := fmt.Sprintf("%s?%s", resource.Address, val.Encode())
	if resource.Param != "" {
		requestUrl = fmt.Sprintf("%s&%s", requestUrl, resource.Param)
	}
	fmt.Println("########## collection URL ##########\n", requestUrl)
	res, err := utils.HttpGet(requestUrl)
	if err != nil {
		return nil, errors.New("error collecting celebrity resource: " + err.Error())
	}
	if resource.DataMethod == XMl {
		xmlData, err := parseCelebrityXml(res)
		if err != nil {
			return nil, errors.New("failed to parse celebrity XML; please confirm the response is actually XML")
		}
		// Repackage the XML result as JSON for the frontend
		class := make([]ClassJson, len(xmlData.Class.Type))
		for i, cl := range xmlData.Class.Type {
			class[i] = ClassJson{
				ID:   cl.ID,
				Name: cl.Name,
			}
		}
		return &CelebrityDataJson{
			PageCount: xmlData.List.PageCount,
			Total:     xmlData.List.RecordCount,
			Page:      xmlData.List.Page,
			Limit:     xmlData.List.PageSize,
			Class:     class,
			List:      xmlData.List.Data,
		}, nil
	}
	// JSON
	return parseCelebrityJson(res)
}
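A side note on the handler above: the page goroutines are fire-and-forget, so the handler responds before collection finishes and failures are only printed. A minimal sketch of how the fan-out could be awaited with sync.WaitGroup; fetchPage here is a hypothetical stand-in for GetCelebrityDataFromRemoteResource, not project code:
package main

import (
	"fmt"
	"sync"
)

// fetchPage is a hypothetical stand-in for the remote call.
func fetchPage(page int64) ([]string, error) {
	return []string{fmt.Sprintf("item from page %d", page)}, nil
}

// collectAll fans out one goroutine per page and waits for all of them,
// so the caller knows collection has actually finished.
func collectAll(pageCount int64) {
	var wg sync.WaitGroup
	for p := int64(2); p <= pageCount; p++ {
		wg.Add(1)
		go func(pg int64) {
			defer wg.Done()
			list, err := fetchPage(pg)
			if err != nil {
				fmt.Printf("page %d failed: %v\n", pg, err)
				return
			}
			fmt.Printf("page %d: %d items\n", pg, len(list))
		}(p)
	}
	wg.Wait() // block until every page goroutine is done
}

func main() {
	collectAll(5)
}
With this shape the handler could also report per-page failures instead of swallowing them in a recover.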
The end result: the collected data was scrambled, with many duplicate records; page n and page m came back with identical data.
My first suspicion was a slice concurrency problem in the save step, but testing the pattern in isolation showed the slices were fine.
The test code is as follows:
package main

import (
	"fmt"
	"time"
)

/*
Test for slice concurrency issues
*/
// Generate a slice of 10 sequential ints for "page" i
func getSlice(i int) []int {
	r := make([]int, 10)
	for j := 0; j < 10; j++ {
		r[j] = (i-1)*10 + j + 1
	}
	return r
}

func test_getSlice() {
	fmt.Println(getSlice(1))
	fmt.Println(getSlice(2))
	fmt.Println(getSlice(3))
}

func sliceParallet() {
	i := 10
	for i > 0 {
		go func(i int) {
			arr := getSlice(i)
			printSlice(arr)
			printSliceFor(arr)
		}(i)
		i--
	}
	time.Sleep(5 * time.Second)
}

func printSlice(arr []int) {
	fmt.Println(arr)
}

func printSliceFor(arr []int) {
	for _, i := range arr {
		fmt.Println(i)
	}
}

func main() {
	//sliceParallet()
	dataParallet()
}

type data struct {
	List []int
}

func getData(i int) *data {
	time.Sleep(1 * time.Second)
	list := getSlice(i)
	return &data{List: list}
}

func testGetData() {
	fmt.Println(getData(1))
	fmt.Println(getData(2))
	fmt.Println(getData(3))
}

func dataParallet() {
	fmt.Println("testing struct printing")
	i := 10
	d := getData(1)
	//printSlice(d.List)
	printSliceFor(d.List)
	for i > 1 {
		go func(i int) {
			dt := getData(i)
			//printSlice(dt.List)
			printSliceFor(dt.List)
		}(i)
		i--
	}
	time.Sleep(11 * time.Second)
}
After studying the code carefully: slices used this way are not the problem.
The crux of the problem:
it was a race between the concurrent requests to the collection resource, which caused identical data to be returned for different pages at the same time.
I also suspected the pointer return type for a while; testing with a struct return value still showed the problem.
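As an aside on that suspicion: switching from a pointer to a value return generally cannot fix this kind of bug, because copying a Go struct copies only the slice header, not the backing array, so any aliasing at the data source survives the copy. A minimal self-contained illustration (all names here are hypothetical, not from the project code):
package main

import "fmt"

type page struct {
	List []int
}

// shared is a buffer reused across "requests" -- the kind of hidden
// aliasing that survives a struct copy.
var shared = make([]int, 3)

// getPage returns the struct by value, but page.List still points
// into the shared backing array.
func getPage(n int) page {
	for i := range shared {
		shared[i] = n
	}
	return page{List: shared}
}

func main() {
	a := getPage(1)
	b := getPage(2)             // overwrites the same backing array
	fmt.Println(a.List, b.List) // [2 2 2] [2 2 2]: both "pages" look identical
}
Both results alias the same reused buffer, so both show the second request's data, which is exactly the "identical pages" symptom.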
The fix was to take a lock where the resource is requested:
var clLock sync.Mutex

func GetCelebrityDataFromRemoteResource(req ResourceReq) (*CelebrityDataJson, error) {
	clLock.Lock() // serialize requests: prevent concurrency from scrambling the data
	defer clLock.Unlock()
	switch req.Op {
	case "day":
		req.Hours = 24
	case "week":
		req.Hours = cast.ToInt64(getWeekIndex() * 24)
	case "month":
		req.Hours = 30 * 24
	}
	db := database.GetGormDb()
	// Look up the resource record
	var resource models.Collection
	if err := db.Where("id=? and data_type=?", req.ResourceID, CollectPerson).First(&resource).Error; errors.Is(err, gorm.ErrRecordNotFound) {
		return nil, errors.New("celebrity resource library does not exist")
	} else if err != nil {
		return nil, err
	}
	// Build the resource URL
	if resource.Address == "" {
		return nil, errors.New("invalid resource address")
	}
	if !strings.HasPrefix(resource.Address, "https://") && !strings.HasPrefix(resource.Address, "http://") {
		resource.Address = fmt.Sprintf("http://%s", resource.Address)
	}
	val, _ := query.Values(req)
	requestUrl := fmt.Sprintf("%s?%s", resource.Address, val.Encode())
	if resource.Param != "" {
		requestUrl = fmt.Sprintf("%s&%s", requestUrl, resource.Param)
	}
	fmt.Println("########## collection URL ##########\n", requestUrl)
	res, err := utils.HttpGet(requestUrl)
	if err != nil {
		return nil, errors.New("error collecting celebrity resource: " + err.Error())
	}
	if resource.DataMethod == XMl {
		xmlData, err := parseCelebrityXml(res)
		if err != nil {
			return nil, errors.New("failed to parse celebrity XML; please confirm the response is actually XML")
		}
		// Repackage the XML result as JSON for the frontend
		class := make([]ClassJson, len(xmlData.Class.Type))
		for i, cl := range xmlData.Class.Type {
			class[i] = ClassJson{
				ID:   cl.ID,
				Name: cl.Name,
			}
		}
		return &CelebrityDataJson{
			PageCount: xmlData.List.PageCount,
			Total:     xmlData.List.RecordCount,
			Page:      xmlData.List.Page,
			Limit:     xmlData.List.PageSize,
			Class:     class,
			List:      xmlData.List.Data,
		}, nil
	}
	// JSON
	return parseCelebrityJson(res)
}
Correction: the real cause was that, under concurrency, the resource side was mutating the properties of a singleton… To keep things efficient (instead of serializing every request with a lock), those properties are now regenerated on each request.
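A minimal sketch of that bug class, with hypothetical names: when a server mutates the fields of a singleton per request, concurrent requests leak state into each other; building a fresh value per request removes the shared mutable state. handleShared below is deliberately racy to illustrate the failure mode:
package main

import (
	"fmt"
	"sync"
)

type query struct{ Page int }

// Buggy: one shared query value mutated by every request.
var singleton = &query{}

func handleShared(page int) int {
	singleton.Page = page // deliberately racy write to shared state
	// another goroutine may overwrite Page before we read it back
	return singleton.Page
}

// Fixed: build a fresh query per request; nothing is shared.
func handleFresh(page int) int {
	q := query{Page: page}
	return q.Page
}

func main() {
	var wg sync.WaitGroup
	for p := 1; p <= 5; p++ {
		wg.Add(1)
		go func(p int) {
			defer wg.Done()
			fmt.Printf("shared=%d fresh=%d (want %d)\n",
				handleShared(p), handleFresh(p), p)
		}(p)
	}
	wg.Wait()
}
Under load, handleShared can report another request's page, which is exactly how two different page requests come back with the same data.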