parent
899de00d9f
commit
cf8b312c88
|
|
@ -1,9 +1,9 @@
|
||||||
package com.xjs.consts;
|
package com.xjs.consts;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* api预警处理常量
|
|
||||||
* @author xiejs
|
* @author xiejs
|
||||||
* @since 2022-01-07
|
* @desc api预警处理常量
|
||||||
|
* @create 2022-01-07
|
||||||
*/
|
*/
|
||||||
public class ApiWarnHandleConst {
|
public class ApiWarnHandleConst {
|
||||||
//已处理
|
//已处理
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
package com.xjs.consts;
|
package com.xjs.consts;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 各个数据平台常量类
|
|
||||||
* @author xiejs
|
* @author xiejs
|
||||||
* @since 2021-12-28
|
* @desc 各个数据平台常量类
|
||||||
|
* @create 2021-12-28
|
||||||
*/
|
*/
|
||||||
public class CopyWritingConst {
|
public class CopyWritingConst {
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
package com.xjs.consts;
|
package com.xjs.consts;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 英语单词常量类
|
|
||||||
* @author xiejs
|
* @author xiejs
|
||||||
* @since 2021-12-31
|
* @desc 英语单词常量类
|
||||||
|
* @create 2021-12-31
|
||||||
*/
|
*/
|
||||||
public class EnglishWordConst {
|
public class EnglishWordConst {
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,9 +27,4 @@ public class RegexConst {
|
||||||
* ip地址v4、v6正则
|
* ip地址v4、v6正则
|
||||||
*/
|
*/
|
||||||
public static final String IP_REGEX ="^((2[0-4]\\d|25[0-5]|[01]?\\d\\d?)\\.){3}(2[0-4]\\d|25[0-5]|[01]?\\d\\d?)$";
|
public static final String IP_REGEX ="^((2[0-4]\\d|25[0-5]|[01]?\\d\\d?)\\.){3}(2[0-4]\\d|25[0-5]|[01]?\\d\\d?)$";
|
||||||
|
|
||||||
/**
|
|
||||||
* 数字校验正则
|
|
||||||
*/
|
|
||||||
public static final String NUMBER_REGEX= "[0-9]*";
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,24 +0,0 @@
|
||||||
package com.xjs.consts;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 爬虫网址常量类
|
|
||||||
* @author xiejs
|
|
||||||
* @since 2022-02-16
|
|
||||||
*/
|
|
||||||
public class ReptileUrlConst {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 新浪新闻网站
|
|
||||||
*/
|
|
||||||
public static final String SINA_NEWS_URL = "https://news.sina.com.cn/";
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 文案网网址
|
|
||||||
*/
|
|
||||||
public static final String COPY_WRITING_NETWORK_URL = "https://www.wenanwang.com/";
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
package com.xjs.consts;
|
package com.xjs.consts;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 请求是否成功常量
|
|
||||||
* @author xiejs
|
* @author xiejs
|
||||||
* @since 2021-12-26
|
* @desc 请求是否成功常量
|
||||||
|
* @create 2021-12-26
|
||||||
*/
|
*/
|
||||||
public class ReqConst {
|
public class ReqConst {
|
||||||
public static final Integer SUCCESS = 1;
|
public static final Integer SUCCESS = 1;
|
||||||
|
|
|
||||||
|
|
@ -18,9 +18,6 @@ import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import static com.xjs.consts.RegexConst.NUMBER_REGEX;
|
|
||||||
import static com.xjs.consts.ReptileUrlConst.COPY_WRITING_NETWORK_URL;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 文案网爬虫任务 url:https://www.wenanwang.com/
|
* 文案网爬虫任务 url:https://www.wenanwang.com/
|
||||||
*
|
*
|
||||||
|
|
@ -37,21 +34,23 @@ public class CopyWritingNetworkTask {
|
||||||
private CopyWritingNetworkService copyWritingNetworkService;
|
private CopyWritingNetworkService copyWritingNetworkService;
|
||||||
|
|
||||||
|
|
||||||
private static final Pattern pattern = Pattern.compile(NUMBER_REGEX);
|
public static final String URL = "https://www.wenanwang.com/";
|
||||||
|
|
||||||
@Scheduled(fixedDelay = 1000 * 5 * 60 * 10)
|
private static Pattern pattern = Pattern.compile("[0-9]*");
|
||||||
|
|
||||||
|
@Scheduled(fixedDelay = 1000 * 5)
|
||||||
public void reptileCopyWriting() {
|
public void reptileCopyWriting() {
|
||||||
try {
|
try {
|
||||||
String html = httpUtils.doGetHtml(COPY_WRITING_NETWORK_URL);
|
String html = httpUtils.doGetHtml(URL);
|
||||||
|
|
||||||
Document document = Jsoup.parse(html);
|
Document document = Jsoup.parse(html);
|
||||||
|
|
||||||
this.parseHtmlGetUrl(document);
|
this.parseHtmlGetUrl(document);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
} finally {
|
}finally {
|
||||||
int i = copyWritingNetworkService.deleteRepeatData();
|
int i = copyWritingNetworkService.deleteRepeatData();
|
||||||
log.info("删除文案网数据重复数:" + i);
|
log.info("删除文案网数据重复数:"+i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -93,7 +92,7 @@ public class CopyWritingNetworkTask {
|
||||||
for (Element element : a) {
|
for (Element element : a) {
|
||||||
|
|
||||||
String href = element.attr("href");
|
String href = element.attr("href");
|
||||||
String newUrl = COPY_WRITING_NETWORK_URL + href;
|
String newUrl = URL + href;
|
||||||
|
|
||||||
String cw = httpUtils.doGetHtml(newUrl);
|
String cw = httpUtils.doGetHtml(newUrl);
|
||||||
Document cwDocument = Jsoup.parse(cw);
|
Document cwDocument = Jsoup.parse(cw);
|
||||||
|
|
|
||||||
|
|
@ -16,8 +16,6 @@ import org.springframework.stereotype.Component;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import static com.xjs.consts.ReptileUrlConst.SINA_NEWS_URL;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新浪新闻爬虫任务
|
* 新浪新闻爬虫任务
|
||||||
* @author xiejs
|
* @author xiejs
|
||||||
|
|
@ -32,12 +30,11 @@ public class SinaNewsTask {
|
||||||
@Autowired
|
@Autowired
|
||||||
private SinaNewsService sinaNewsService;
|
private SinaNewsService sinaNewsService;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void reptileSinaNews() {
|
public void reptileSinaNews() {
|
||||||
try {
|
try {
|
||||||
|
String url = "https://news.sina.com.cn/";
|
||||||
|
|
||||||
String html = httpUtils.doGetHtml(SINA_NEWS_URL);
|
String html = httpUtils.doGetHtml(url);
|
||||||
|
|
||||||
Document document = Jsoup.parse(html);
|
Document document = Jsoup.parse(html);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue