1、36壁纸网爬虫配置参数实现页面可以自行配置
This commit is contained in:
parent
7c2e1b684d
commit
f2a6a46d29
|
|
@ -0,0 +1,26 @@
|
||||||
|
package com.ruoyi.system.api;
|
||||||
|
|
||||||
|
import com.ruoyi.common.core.constant.ServiceNameConstants;
|
||||||
|
import com.ruoyi.common.core.domain.R;
|
||||||
|
import com.ruoyi.system.api.factory.RemoteConfigFallbackFactory;
|
||||||
|
import org.springframework.cloud.openfeign.FeignClient;
|
||||||
|
import org.springframework.web.bind.annotation.GetMapping;
|
||||||
|
import org.springframework.web.bind.annotation.PathVariable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 系统配置服务
|
||||||
|
* @author xiejs
|
||||||
|
* @since 2022-02-20
|
||||||
|
*/
|
||||||
|
@FeignClient(contextId = "remoteConfigService",
|
||||||
|
value = ServiceNameConstants.SYSTEM_SERVICE,
|
||||||
|
fallbackFactory = RemoteConfigFallbackFactory.class)
|
||||||
|
public interface RemoteConfigService {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 根据参数键名查询参数值
|
||||||
|
*/
|
||||||
|
@GetMapping(value = "/config/configKeyForRPC/{configKey}")
|
||||||
|
R<String> getConfigKeyForRPC(@PathVariable("configKey") String configKey);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
package com.ruoyi.system.api.factory;
|
||||||
|
|
||||||
|
import com.ruoyi.common.core.domain.R;
|
||||||
|
import com.ruoyi.system.api.RemoteConfigService;
|
||||||
|
import lombok.extern.log4j.Log4j2;
|
||||||
|
import org.springframework.cloud.openfeign.FallbackFactory;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author xiejs
|
||||||
|
* @since 2022-02-20
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
@Log4j2
|
||||||
|
public class RemoteConfigFallbackFactory implements FallbackFactory<RemoteConfigService> {
|
||||||
|
@Override
|
||||||
|
public RemoteConfigService create(Throwable cause) {
|
||||||
|
return new RemoteConfigService() {
|
||||||
|
@Override
|
||||||
|
public R<String> getConfigKeyForRPC(String configKey) {
|
||||||
|
log.error("系统配置服务调用失败:{}", cause.getMessage());
|
||||||
|
return R.fail("系统配置服务调用失败");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
|
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
|
||||||
com.ruoyi.system.api.factory.RemoteUserFallbackFactory,\
|
com.ruoyi.system.api.factory.RemoteUserFallbackFactory,\
|
||||||
com.ruoyi.system.api.factory.RemoteLogFallbackFactory, \
|
com.ruoyi.system.api.factory.RemoteLogFallbackFactory, \
|
||||||
com.ruoyi.system.api.factory.RemoteFileFallbackFactory
|
com.ruoyi.system.api.factory.RemoteFileFallbackFactory,\
|
||||||
|
com.ruoyi.system.api.factory.RemoteConfigFallbackFactory
|
||||||
|
|
|
||||||
|
|
@ -1,18 +1,7 @@
|
||||||
package com.ruoyi.system.controller;
|
package com.ruoyi.system.controller;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import javax.servlet.http.HttpServletResponse;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.validation.annotation.Validated;
|
|
||||||
import org.springframework.web.bind.annotation.DeleteMapping;
|
|
||||||
import org.springframework.web.bind.annotation.GetMapping;
|
|
||||||
import org.springframework.web.bind.annotation.PathVariable;
|
|
||||||
import org.springframework.web.bind.annotation.PostMapping;
|
|
||||||
import org.springframework.web.bind.annotation.PutMapping;
|
|
||||||
import org.springframework.web.bind.annotation.RequestBody;
|
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
|
||||||
import com.ruoyi.common.core.constant.UserConstants;
|
import com.ruoyi.common.core.constant.UserConstants;
|
||||||
|
import com.ruoyi.common.core.domain.R;
|
||||||
import com.ruoyi.common.core.utils.poi.ExcelUtil;
|
import com.ruoyi.common.core.utils.poi.ExcelUtil;
|
||||||
import com.ruoyi.common.core.web.controller.BaseController;
|
import com.ruoyi.common.core.web.controller.BaseController;
|
||||||
import com.ruoyi.common.core.web.domain.AjaxResult;
|
import com.ruoyi.common.core.web.domain.AjaxResult;
|
||||||
|
|
@ -23,6 +12,12 @@ import com.ruoyi.common.security.annotation.RequiresPermissions;
|
||||||
import com.ruoyi.common.security.utils.SecurityUtils;
|
import com.ruoyi.common.security.utils.SecurityUtils;
|
||||||
import com.ruoyi.system.domain.SysConfig;
|
import com.ruoyi.system.domain.SysConfig;
|
||||||
import com.ruoyi.system.service.ISysConfigService;
|
import com.ruoyi.system.service.ISysConfigService;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.validation.annotation.Validated;
|
||||||
|
import org.springframework.web.bind.annotation.*;
|
||||||
|
|
||||||
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 参数配置 信息操作处理
|
* 参数配置 信息操作处理
|
||||||
|
|
@ -76,6 +71,16 @@ public class SysConfigController extends BaseController
|
||||||
return AjaxResult.success(configService.selectConfigByKey(configKey));
|
return AjaxResult.success(configService.selectConfigByKey(configKey));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 根据参数键名查询参数值forRpc
|
||||||
|
* @since 2022-02-20
|
||||||
|
* @Author xjs
|
||||||
|
*/
|
||||||
|
@GetMapping(value = "/configKeyForRPC/{configKey}")
|
||||||
|
public R<String> getConfigKeyForRPC(@PathVariable String configKey) {
|
||||||
|
return R.ok(configService.selectConfigByKey(configKey));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新增参数配置
|
* 新增参数配置
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,7 @@ public class RedisConst {
|
||||||
/**
|
/**
|
||||||
* 爬虫记录循环次数常量信息
|
* 爬虫记录循环次数常量信息
|
||||||
*/
|
*/
|
||||||
public static final String REPTILE_COUNT= "reptile:count";
|
public static final String REPTILE_COUNT= "reptile:_36wallpaper.count";
|
||||||
|
|
||||||
//-------------------有效时间-----------------------
|
//-------------------有效时间-----------------------
|
||||||
public static final Integer TRAN_DICT_EXPIRE = 1; //小时
|
public static final Integer TRAN_DICT_EXPIRE = 1; //小时
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package com.xjs._36wallpaper.task;
|
package com.xjs._36wallpaper.task;
|
||||||
|
|
||||||
import com.xjs._36wallpaper.service._36wallpaperService;
|
|
||||||
import com.xjs._36wallpaper.webmagic._36wallpaperProcessor;
|
import com.xjs._36wallpaper.webmagic._36wallpaperProcessor;
|
||||||
import com.xjs.annotation.ReptileLog;
|
import com.xjs.annotation.ReptileLog;
|
||||||
import lombok.extern.log4j.Log4j2;
|
import lombok.extern.log4j.Log4j2;
|
||||||
|
|
@ -21,8 +20,6 @@ public class _36wallpaperTask {
|
||||||
@Autowired
|
@Autowired
|
||||||
private _36wallpaperProcessor wallpaperProcessor;
|
private _36wallpaperProcessor wallpaperProcessor;
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private _36wallpaperService wallpaperService;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -31,12 +28,7 @@ public class _36wallpaperTask {
|
||||||
*/
|
*/
|
||||||
@ReptileLog(name = "36壁纸网", url = _36_WALLPAPER_URL)
|
@ReptileLog(name = "36壁纸网", url = _36_WALLPAPER_URL)
|
||||||
public Long reptileWallpaper() {
|
public Long reptileWallpaper() {
|
||||||
Long run = wallpaperProcessor.run();
|
return wallpaperProcessor.run();
|
||||||
//删除重复数据
|
|
||||||
int count = wallpaperService.deleteRepeatData();
|
|
||||||
log.info("36壁纸删除重复数据数:" + count);
|
|
||||||
|
|
||||||
return run;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
package com.xjs._36wallpaper.webmagic;
|
package com.xjs._36wallpaper.webmagic;
|
||||||
|
|
||||||
import cn.hutool.core.collection.CollUtil;
|
|
||||||
import com.xjs._36wallpaper.pojo._36wallpaper;
|
|
||||||
import com.xjs._36wallpaper.service._36wallpaperService;
|
import com.xjs._36wallpaper.service._36wallpaperService;
|
||||||
import lombok.extern.log4j.Log4j2;
|
import lombok.extern.log4j.Log4j2;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
|
@ -10,8 +8,6 @@ import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* webmagic持久化保存
|
* webmagic持久化保存
|
||||||
* @author xiejs
|
* @author xiejs
|
||||||
|
|
@ -27,11 +23,13 @@ public class _36wallpaperPipeline implements Pipeline {
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
|
||||||
List<_36wallpaper> wallpaperData = resultItems.get("_36wallpaperData");
|
//这种方法效率低
|
||||||
|
|
||||||
|
/*List<_36wallpaper> wallpaperData = resultItems.get("_36wallpaperData");
|
||||||
|
|
||||||
if (CollUtil.isNotEmpty(wallpaperData)) {
|
if (CollUtil.isNotEmpty(wallpaperData)) {
|
||||||
wallpaperService.saveBatch(wallpaperData, 25);
|
wallpaperService.saveBatch(wallpaperData, 25);
|
||||||
}
|
}*/
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,12 @@
|
||||||
package com.xjs._36wallpaper.webmagic;
|
package com.xjs._36wallpaper.webmagic;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson.JSONObject;
|
||||||
import com.ruoyi.common.redis.service.RedisService;
|
import com.ruoyi.common.redis.service.RedisService;
|
||||||
|
import com.ruoyi.system.api.RemoteConfigService;
|
||||||
import com.xjs._36wallpaper.pojo._36wallpaper;
|
import com.xjs._36wallpaper.pojo._36wallpaper;
|
||||||
|
import com.xjs._36wallpaper.service._36wallpaperService;
|
||||||
import lombok.extern.log4j.Log4j2;
|
import lombok.extern.log4j.Log4j2;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
|
@ -32,17 +36,51 @@ import static com.xjs.consts.ReptileConst._36_WALLPAPER_URL;
|
||||||
*/
|
*/
|
||||||
@Log4j2
|
@Log4j2
|
||||||
@Component
|
@Component
|
||||||
|
|
||||||
public class _36wallpaperProcessor implements PageProcessor {
|
public class _36wallpaperProcessor implements PageProcessor {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 是否全网爬虫
|
* 是否全网爬虫
|
||||||
*/
|
*/
|
||||||
private static boolean init = false;
|
private boolean init = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 是否下载图片到磁盘
|
||||||
|
*/
|
||||||
|
private boolean downloadImg = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 图片保存到磁盘的路径
|
||||||
|
*/
|
||||||
|
private String path = "D:\\Dev\\WebCrawler\\36wallpaper";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* redis的key
|
||||||
|
*/
|
||||||
|
public static final String REDIS_KEY = "sys_config:xjs.webmagic._36wallpaper";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 系统配置表中的key
|
||||||
|
*/
|
||||||
|
public static final String CONFIG_KEY = "xjs.webmagic._36wallpaper";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 请求头key
|
||||||
|
*/
|
||||||
private static final String headerKey = "User-Agent";
|
private static final String headerKey = "User-Agent";
|
||||||
|
/**
|
||||||
|
* 请求头value
|
||||||
|
*/
|
||||||
private static final String headerValue = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36";
|
private static final String headerValue = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36";
|
||||||
|
|
||||||
|
|
||||||
|
private static RemoteConfigService remoteConfigService;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
public void setRemoteConfigService(RemoteConfigService remoteConfigService) {
|
||||||
|
_36wallpaperProcessor.remoteConfigService = remoteConfigService;
|
||||||
|
}
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private _36wallpaperPipeline wallpaperPipeline;
|
private _36wallpaperPipeline wallpaperPipeline;
|
||||||
|
|
||||||
|
|
@ -54,31 +92,46 @@ public class _36wallpaperProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*private static _36wallpaperService wallpaperService;
|
private static _36wallpaperService wallpaperService;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
public void setWallpaperService(_36wallpaperService wallpaperService) {
|
public void setWallpaperService(_36wallpaperService wallpaperService) {
|
||||||
_36wallpaperProcessor.wallpaperService = wallpaperService;
|
_36wallpaperProcessor.wallpaperService = wallpaperService;
|
||||||
}*/
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 图片保存到磁盘的路径
|
* 初始化参数
|
||||||
*/
|
*/
|
||||||
private static final String path = "D:\\Dev\\WebCrawler\\36wallpaper";
|
private void initParameter() {
|
||||||
|
//判断redis中是否存在
|
||||||
|
Boolean hasKey = redisService.hasKey(REDIS_KEY);
|
||||||
|
if (hasKey) {
|
||||||
|
String cacheObject = redisService.getCacheObject(REDIS_KEY);
|
||||||
|
JSONObject json = JSONObject.parseObject(cacheObject);
|
||||||
|
this.init = json.getBoolean("init");
|
||||||
|
this.downloadImg = json.getBoolean("downloadImg");
|
||||||
|
this.path = json.getString("path");
|
||||||
|
} else if (StringUtils.isNotEmpty(remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData())) {
|
||||||
|
String data = remoteConfigService.getConfigKeyForRPC(CONFIG_KEY).getData();
|
||||||
|
JSONObject json = JSONObject.parseObject(data);
|
||||||
|
this.init = json.getBoolean("init");
|
||||||
|
this.downloadImg = json.getBoolean("downloadImg");
|
||||||
|
this.path = json.getString("path");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private Site site = Site.me()
|
/**
|
||||||
.addHeader(headerKey,headerValue)
|
* 解析页面
|
||||||
.setCharset("utf8")//设置字符编码
|
*
|
||||||
.setTimeOut(2000)//设置超时时间
|
* @param page 页面
|
||||||
.setRetrySleepTime(200)//设置重试间隔时间
|
*/
|
||||||
.setCycleRetryTimes(6)//设置重试次数
|
|
||||||
.setSleepTime(1)//设置两个页面之间的间隔时间
|
|
||||||
;
|
|
||||||
|
|
||||||
//解析页面
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
|
//初始化配置 (放在此处而不放在run方法原因,每次执行该方法都是创建线程拿到当前的类属性不一致)
|
||||||
|
initParameter();
|
||||||
|
|
||||||
//解析返回的数据page,并且把解析的结果放到ResultItems中
|
//解析返回的数据page,并且把解析的结果放到ResultItems中
|
||||||
|
|
||||||
/*//第一种写法:css选择器
|
/*//第一种写法:css选择器
|
||||||
|
|
@ -136,9 +189,11 @@ public class _36wallpaperProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
|
||||||
//保存到磁盘
|
//保存到磁盘
|
||||||
if (link != null) {
|
if (downloadImg) {
|
||||||
String thisPath = path + File.separator + title;
|
if (link != null) {
|
||||||
downloadPicture(link, thisPath, pictureName + ".jpg");
|
String thisPath = path + File.separator + title;
|
||||||
|
downloadPicture(link, thisPath, pictureName + ".jpg");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//爬取图片标签
|
//爬取图片标签
|
||||||
|
|
@ -162,17 +217,17 @@ public class _36wallpaperProcessor implements PageProcessor {
|
||||||
}
|
}
|
||||||
|
|
||||||
//持久化 --使用Pipeline实现持久化了
|
//持久化 --使用Pipeline实现持久化了
|
||||||
//wallpaperService.saveBatch(wallpapers, 25);
|
wallpaperService.saveBatch(wallpapers, 25);
|
||||||
|
|
||||||
//暂时保存到内存中,后续实现Pipeline接口保存到数据库
|
//暂时保存到内存中,后续实现Pipeline接口保存到数据库--效率低下
|
||||||
page.putField("_36wallpaperData",wallpapers);
|
//page.putField("_36wallpaperData",wallpapers);
|
||||||
|
|
||||||
//循环次数存入redis中
|
//循环次数存入redis中
|
||||||
Integer count = redisService.getCacheObject(REPTILE_COUNT);
|
Integer count = redisService.getCacheObject(REPTILE_COUNT);
|
||||||
if (count == null) {
|
if (count == null) {
|
||||||
count=0;
|
count = 0;
|
||||||
}
|
}
|
||||||
redisService.setCacheObject(REPTILE_COUNT, count+1);
|
redisService.setCacheObject(REPTILE_COUNT, count + 1);
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
|
@ -185,7 +240,14 @@ public class _36wallpaperProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return site;
|
return Site.me()
|
||||||
|
.addHeader(headerKey, headerValue)
|
||||||
|
.setCharset("utf8")//设置字符编码
|
||||||
|
.setTimeOut(2000)//设置超时时间
|
||||||
|
.setRetrySleepTime(100)//设置重试间隔时间
|
||||||
|
.setCycleRetryTimes(10)//设置重试次数
|
||||||
|
.setSleepTime(1)//设置两个页面之间的间隔时间
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -194,12 +256,18 @@ public class _36wallpaperProcessor implements PageProcessor {
|
||||||
* @return 返回循环次数
|
* @return 返回循环次数
|
||||||
*/
|
*/
|
||||||
public Long run() {
|
public Long run() {
|
||||||
Spider.create(new _36wallpaperProcessor()).addUrl(_36_WALLPAPER_URL).thread(20)
|
//执行爬虫
|
||||||
.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))
|
Spider.create(new _36wallpaperProcessor())
|
||||||
.addPipeline(wallpaperPipeline)
|
.addUrl(_36_WALLPAPER_URL)//设置爬取地址
|
||||||
.run();
|
.thread(30)//设置爬取线程数
|
||||||
|
.setScheduler(new QueueScheduler()
|
||||||
|
.setDuplicateRemover(new BloomFilterDuplicateRemover(110000)))//设置url去重过滤器
|
||||||
|
//.addPipeline(wallpaperPipeline)//设置爬取之后的数据操作
|
||||||
|
.run();//执行
|
||||||
|
|
||||||
|
//删除重复数据
|
||||||
|
int count = wallpaperService.deleteRepeatData();
|
||||||
|
log.info("36壁纸删除重复数据数:" + count);
|
||||||
|
|
||||||
//从redis中获取循环次数
|
//从redis中获取循环次数
|
||||||
Integer cache = redisService.getCacheObject(REPTILE_COUNT);
|
Integer cache = redisService.getCacheObject(REPTILE_COUNT);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue