<?php
/**
 * easylist extend
 *
 * @file easylist-extend.php
 * @date 2019-12-24
 * @author gently
 *
 */

// No execution time limit for this run.
set_time_limit(0);
// Report fatal errors, warnings and parse errors only (1 | 2 | 4, i.e. the literal 7).
error_reporting(E_ERROR | E_WARNING | E_PARSE);

// Start timestamp; the elapsed time is printed as the last step of the script.
define('START_TIME', microtime(true));
// Project root: the parent of the directory that holds this script.
define('ROOT_DIR', dirname(__DIR__) . '/');
define('LIB_DIR', ROOT_DIR . 'lib/');

// The value returned by the included file is kept as the black domain list.
// NOTE(review): it is later used with array_key_exists(), so it is presumably
// an array keyed by main domain - confirm against lib/black_domain_list.php.
$black_domain_list = require_once LIB_DIR . 'black_domain_list.php';
require_once LIB_DIR . 'addressMaker.class.php';

// Input files holding the wildcard rules and the whitelist ("@@") rules.
define('WILDCARD_SRC', ROOT_DIR . 'origin-files/wildcard-src-easylist.txt');
define('WHITERULE_SRC', ROOT_DIR . 'origin-files/whiterule-src-easylist.txt');
// Hand-maintained wildcard rules that are merged into the rules parsed from
// WILDCARD_SRC. Only the keys are read by this script; every value is null.
$ARR_MERGED_WILD_LIST = array_fill_keys(
    [
        'ad*.udn.com',
        '*.mgr.consensu.org',
        'vs*.gzcu.u3.ucweb.com',
        'ad*.goforandroid.com',
        'bs*.9669.cn',
        '*dnserror*.wo.com.cn',
        '*mistat*.xiaomi.com',
        'affrh20*.com',
        'gsp*.baidu.com',
        'assoc-amazon.*',
        'clkservice*.youdao.com',
        'dsp*.youdao.com',
        'ad*.sina.com.cn',
        'sax*.sina.com.cn',
        'pussl*.com',
        'putrr*.com',
        'ad.*.360.cn',
        't*.a.market.xiaomi.com',
        'ad*.bigmir.net',
        'log*.molitv.cn',
        'adm*.autoimg.cn',
        'cloudservice*.kingsoft-office-service.com',
        'gg*.51cto.com',
        'log.*.hunantv.com',
        '*.log.hunantv.com',
        'iflyad.*.openstorage.cn',
        '*customstat*.51togic.com',
        'appcloud*.zhihu.com',
        'sf*-ttcdn-tos.pstatp.com',
        'ad*.molitv.cn',
        'ads*-adnow.com',
        'aeros*.tk',
        'analyzer*.fc2.com',
        'admicro*.vcmedia.vn',
        'xn--xhq9mt12cf5v.*',
        'freecontent.*',
        'hostingcloud.*',
        'jshosting.*',
        'flightzy.*',
        'sunnimiq*.cf',
    ],
    null
);
// Regular expressions tested against bare host names. Rules they match are
// replaced in the output by the expression itself. Only the keys are read;
// every value is null.
$ARR_REGEX_LIST = array_fill_keys(
    [
        '/^01daa\.[a-z]+\.com$/',
        '/^9377[a-z]{2}\.com$/',
        '/^[1-3]\.[0-9a-z\.\-]+\.(com|cn|net|org|cc|me)$/',
        // '/^a1\.[0-9a-z\.]+\.(com|cn|org|net|me)$/',
        '/^ad([0-9]|m|s)?\./',
        '/^affiliat(es|ion|e)\./',
        '/^afgr[0-9]{1,2}\.com$/',
        '/^analytics(\-|\.)/',
        '/^counter(\-|\.)/',
        '/^pixels?\./',
        '/^syma[a-z]\.cn$/',
        '/^widgets?\./',
        '/^(web)?stats?\./',
        '/^track(ing)?\./',
        '/^tongji\./',
        '/^toolbar\./',
        '/^adservice\.google\./',
    ],
    null
);
// Extra exemption (whitelist) rules added on top of the wildcard / regex matches.
$ARR_WHITE_RULE_LIST = [
    '@@||github.com^',
    '@@||tongji.*kuwo.cn^',
];

// Upstream exemption rules that anti-AD refuses to honour,
// i.e. a blacklist applied to the whitelist.
$ARR_WHITE_RULE_BLK_LIST = [
    '@@||ads.nipr.ac.jp^' => null,
];

// Upstream wildcard rules that anti-AD does not trust,
// i.e. a blacklist applied to the wildcard rules.
$ARR_WILD_BLK_LIST = [
    'cnt*rambler.ru' => null,
];
// This script only does work under the command-line SAPI.
if (PHP_SAPI !== 'cli') {
    die('nothing.');
}

// The first CLI argument is the path of the file to rewrite, relative to ROOT_DIR.
// Reading a missing $argv entry raises a notice/warning, never an Exception,
// so the argument is checked explicitly instead of being wrapped in try/catch.
$file = isset($argv[1]) ? $argv[1] : '';
if ($file === '') {
    echo "get args failed.", 'missing source file argument', "\n";
    // Exit status 0 is kept for compatibility with existing callers.
    die(0);
}
$src_file = ROOT_DIR . $file;

if (!is_file($src_file)) {
    echo 'src_file:', $src_file, ' is not found.';
    die(0);
}

// Both upstream extracts are required before any output is produced.
if (!is_file(WILDCARD_SRC) || !is_file(WHITERULE_SRC)) {
    echo 'key file is not found.';
    die(0);
}
// Open the rule file to rewrite, the wildcard source, and a temporary output
// file next to the source ("<src>.txt").
$src_fp = fopen($src_file, 'r');
$wild_fp = fopen(WILDCARD_SRC, 'r');
$new_fp = fopen($src_file . '.txt', 'w');
if ($src_fp === false || $wild_fp === false || $new_fp === false) {
    // Without this guard a failed open makes the read loops below spin forever
    // (PHP 7) or abort with a TypeError (PHP 8).
    echo 'open file failed.', "\n";
    exit(1);
}

// Rules (wildcard strings and regex strings) already written to the output.
$wrote_wild = array();
// Accepted wildcard rules: core pattern => original source line.
$arr_wild_src = array();

// Whole lines are read (no length cap) so that a long rule is never split
// into several pseudo-rows.
while (($wild_row = fgets($wild_fp)) !== false) {
    // Keep only "|host^" / "||host^" rules, optionally followed by an option
    // list that contains image, third-party or script.
    if (!preg_match('/^\|\|?([\w\-\.\*]+?)\^(\$([^=]+?,)?(image|third-party|script)(,[^=]+)?)?$/', $wild_row, $matches)) {
        continue;
    }

    // Skip wildcard rules that are explicitly distrusted.
    if (array_key_exists($matches[1], $ARR_WILD_BLK_LIST)) {
        continue;
    }

    // Skip rules that one of the regex rules matches once the "*" characters
    // are stripped; one hit is enough, so the scan stops there.
    $bare_host = str_replace('*', '', $matches[1]);
    $matched = false;
    foreach ($ARR_REGEX_LIST as $regex_str => $regex_row) {
        if (preg_match($regex_str, $bare_host)) {
            $matched = true;
            break;
        }
    }
    if ($matched) {
        continue;
    }

    $arr_wild_src[$matches[1]] = $wild_row;
}
fclose($wild_fp);

// Append the hand-maintained wildcard rules; on a duplicate key the
// hand-maintained entry replaces the parsed one.
$arr_wild_src = array_merge($arr_wild_src, $ARR_MERGED_WILD_LIST);
// Compile every wildcard rule into a regex once, instead of once per row.
// The rule is quoted first so that "." and "-" are literal and only "*" acts
// as a wildcard; unquoted, "." would match any character.
$wild_patterns = array();
foreach ($arr_wild_src as $core_str => $wild_row) {
    $wild_patterns[$core_str] = '/\|' . str_replace('\*', '.*', preg_quote((string)$core_str, '/')) . '/';
}

// Copy the source file to the new file, replacing rows covered by a regex or
// wildcard rule with that rule. Whole lines are read (no length cap): with a
// cap, the tail of a dropped long rule would be copied through as an orphan.
while (($row = fgets($src_fp)) !== false) {
    // Rows that do not start with "|" plus at least one more character are
    // copied through unchanged.
    if (!preg_match('/^\|.+?/', $row)) {
        fwrite($new_fp, $row);
        continue;
    }

    // Strip the two leading and one trailing character from the trimmed row
    // ("||host^" => "host").
    $host = (string)substr(trim($row), 2, -1);

    // Regex rules: each matching expression is written once, and the row
    // itself is dropped.
    $matched = false;
    foreach ($ARR_REGEX_LIST as $regex_str => $regex_row) {
        if (!preg_match($regex_str, $host)) {
            continue;
        }
        $matched = true;
        if (!array_key_exists($regex_str, $wrote_wild)) {
            fwrite($new_fp, "{$regex_str}\n");
            $wrote_wild[$regex_str] = 1;
        }
    }
    if ($matched) {
        continue;
    }

    // Wildcard rules: every rule visited before the first match is emitted
    // once, whether or not it matches this row; the scan stops at the first
    // rule that matches.
    foreach ($wild_patterns as $core_str => $pattern) {
        if (!array_key_exists($core_str, $wrote_wild)) {
            fwrite($new_fp, "||{$core_str}^\n");
            $wrote_wild[$core_str] = 1;
        }
        if (preg_match($pattern, $row)) {
            $matched = true;
            break;
        }
    }
    if ($matched) {
        continue;
    }

    // Not covered by any regex or wildcard rule: keep the row as is.
    fwrite($new_fp, $row);
}
// Write whitelist rules as needed: an exemption is emitted only when one of
// the regex / wildcard rules already written to the output matches its domain.
$wrote_whitelist = array();
// FILE_SKIP_EMPTY_LINES only takes effect together with FILE_IGNORE_NEW_LINES.
$whiterule = file(WHITERULE_SRC, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($whiterule === false) {
    // Unreadable whitelist source: fall back to the built-in rules only.
    $whiterule = array();
}
$ARR_WHITE_RULE_LIST = array_merge($ARR_WHITE_RULE_LIST, $whiterule);

// Compile the written rules once. Keys that start with "/" are already
// regular expressions; all others are wildcard rules, quoted so that only
// "*" acts as a wildcard.
$written_patterns = array();
foreach ($wrote_wild as $core_str => $val) {
    $core_str = (string)$core_str;
    if ($core_str !== '' && $core_str[0] === '/') {
        $written_patterns[] = $core_str;
    } else {
        $written_patterns[] = '/' . str_replace('\*', '.*', preg_quote($core_str, '/')) . '/';
    }
}

foreach ($ARR_WHITE_RULE_LIST as $row) {
    // Only "@@"-prefixed rows are exemption rules. Bracket-free prefix test:
    // the "{0}" string-offset syntax is a parse error on PHP 8.
    if (strncmp($row, '@@', 2) !== 0) {
        continue;
    }

    $matches = array();
    if (!preg_match('/@@\|\|([0-9a-z\.\-\*]+?)\^/', $row, $matches)) {
        continue;
    }
    $white_domain = $matches[1];

    // Exemptions that anti-AD refuses to honour.
    if (array_key_exists("@@||{$white_domain}^", $ARR_WHITE_RULE_BLK_LIST)) {
        continue;
    }

    // Each exemption is written at most once.
    if (array_key_exists($white_domain, $wrote_whitelist)) {
        continue;
    }

    // The exemption matters only if a written rule would block the domain.
    $is_blocked = false;
    foreach ($written_patterns as $pattern) {
        if (preg_match($pattern, $white_domain)) {
            $is_blocked = true;
            break;
        }
    }
    if (!$is_blocked) {
        continue;
    }

    // Never exempt a domain whose main domain is in the black domain list.
    // The former second clause (is_array(...) && in_array(...)) could only
    // run when the key was absent, so it was always false and read an
    // undefined key; it is removed without changing the outcome.
    $domain = addressMaker::extract_main_domain($white_domain);
    if (array_key_exists($domain, $black_domain_list)) {
        continue;
    }

    $wrote_whitelist[$white_domain] = null;
    fwrite($new_fp, "@@||{$white_domain}^\n");
}
// Release both handles before the temporary file replaces the source file.
fclose($src_fp);
fclose($new_fp);

// Move the "<src>.txt" output over the original source path.
$tmp_file = $src_file . '.txt';
rename($tmp_file, $src_file);

// Report the elapsed time since START_TIME and the current local time.
$elapsed = microtime(true) - START_TIME;
echo 'Time cost:', $elapsed, "s, at ", date('m-d H:i:s'), "\n";