Skip to content

Commit

Permalink
Merge branch 'SukkaW:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
FYLSen authored Sep 2, 2024
2 parents 4326c8c + c96b35b commit b670ca0
Show file tree
Hide file tree
Showing 13 changed files with 338 additions and 414 deletions.
5 changes: 5 additions & 0 deletions Build/constants/loose-tldts-opt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ export const looseTldtsOpt: Parameters<typeof tldts.getSubdomain>[1] = {
detectIp: false,
mixedInputs: false
};

/**
 * Variant of `looseTldtsOpt` for callers that also want private suffixes
 * recognised: every field of `looseTldtsOpt` is copied into a fresh object
 * and `allowPrivateDomains` is set to `true` on top of it.
 */
export const loosTldOptWithPrivateDomains: Parameters<typeof tldts.getSubdomain>[1] = Object.assign(
  {},
  looseTldtsOpt,
  { allowPrivateDomains: true }
);
6 changes: 4 additions & 2 deletions Build/constants/reject-data-source.ts
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,10 @@ export const PREDEFINED_WHITELIST = [
'm.stripe.com', // EasyPrivacy only blocks m.stripe.com wwith $third-party,
// yet stupid AdGuardDNSFilter blocks all of it. Stupid AdGuard
'.w3s.link', // stupid phishing.army, introduce both "*.ipfs.w3s.link" and ".w3s.link" to the block list
'ipfs.io', // ipfs.io was blocked by DigitalSide Threat-Intel - OSINT Hub
'.r2.dev', // Despite 5000+ r2 instances used for phishing, yet cloudflare refuse to do anything. we have no choice but whitelist this.
'mlsend.com', // Fuck Peter Lowe Hosts
'ab.chatgpt.com' // EasyPrivacy blocks this
'ab.chatgpt.com', // EasyPrivacy blocks this
'jnn-pa.googleapis.com', // ad-wars
'imasdk.googleapis.com', // ad-wars
'.l.qq.com' // ad-wars
];
2 changes: 1 addition & 1 deletion Build/lib/cache-filesystem.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,5 +217,5 @@ export const deserializeArray = (str: string) => str.split(separator);

/**
 * Builds a cache-key factory tied to the current contents of a file.
 *
 * The file at `filename` is read synchronously as UTF-8 and hashed with
 * `stringHash`; the returned function combines a caller-supplied `key`
 * with that hash, using `$` as the separator.
 *
 * @param filename - path of the file whose contents are hashed
 * @returns a function mapping `key` to a string that embeds the file hash
 */
export const createCacheKey = (filename: string) => {
// The hash is computed once, when the factory is created, not on every key lookup.
const fileHash = stringHash(fs.readFileSync(filename, 'utf-8'));
return (key: string) => key + '$' + fileHash;
// NOTE(review): this second `return` is unreachable as written. It differs from
// the one above only by the trailing '$' — confirm which form is intended and
// remove the other.
return (key: string) => key + '$' + fileHash + '$';
};
5 changes: 4 additions & 1 deletion Build/lib/get-phishing-domains.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import { calcDomainAbuseScore } from './get-phishing-domains';

// Manual smoke test for `calcDomainAbuseScore`: each call prints the score for a
// sample hostname and its subdomain part. Nothing is asserted, so this case
// cannot fail on a wrong score — the output has to be inspected by hand.
describe('sortDomains', () => {
it('nmdj.pl', () => {
// NOTE(review): this call passes a third argument (`false`) while the calls
// below pass two — confirm it matches the current `calcDomainAbuseScore` signature.
console.log(calcDomainAbuseScore('.01462ccca801fed55370d79231c876e5.nmdj.pl', '.01462ccca801fed55370d79231c876e5', false));
console.log(calcDomainAbuseScore('zrz.q435.one', 'zrz'));
console.log(calcDomainAbuseScore('z1.finprotect.click', 'z1'));
console.log(calcDomainAbuseScore('accountsettingaddrecoverymanagesiteupdatebillingreview.village.softcare.co.in', 'accountsettingaddrecoverymanagesiteupdatebillingreview.village'));
console.log(calcDomainAbuseScore('allegrolokalnie.pl-oferta51328742.pl', 'allegrolokalnie'));
});
});
120 changes: 68 additions & 52 deletions Build/lib/get-phishing-domains.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import * as tldts from 'tldts-experimental';
import type { Span } from '../trace';
import { appendArrayInPlaceCurried } from './append-array-in-place';
import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt';
import picocolors from 'picocolors';
import createKeywordFilter from './aho-corasick';
import { createCacheKey } from './cache-filesystem';

const BLACK_TLD = new Set([
'accountant',
Expand Down Expand Up @@ -77,6 +78,7 @@ const BLACK_TLD = new Set([
'space',
'store',
'stream',
'surf',
'tech',
'tk',
'tokyo',
Expand All @@ -95,13 +97,13 @@ const BLACK_TLD = new Set([

// Apex domains that `getPhishingDomains` never promotes to a whole-domain
// `.${domain}` entry: the final loop checks `WHITELIST_MAIN_DOMAINS.has(domain)`
// before pushing, regardless of how the domain scored.
// NOTE(review): 'dweb.link', 'nftstorage.link', 'notion.site' and 'vercel.app'
// each appear both as an active entry and as a commented-out entry below —
// confirm which state is intended and drop the other.
const WHITELIST_MAIN_DOMAINS = new Set([
'w3s.link', // ipfs gateway
'dweb.link', // ipfs gateway
'nftstorage.link', // ipfs gateway
// 'dweb.link', // ipfs gateway
// 'nftstorage.link', // ipfs gateway
'fleek.cool', // ipfs gateway
'business.site', // Drag'n'Drop site building platform
'page.link', // Firebase URL Shortener
'notion.site',
'vercel.app',
// 'notion.site',
// 'vercel.app',
'gitbook.io'
]);

Expand All @@ -120,27 +122,34 @@ const sensitiveKeywords = createKeywordFilter([
'virus-',
'icloud-',
'apple-',
'www.apple.',
'-coinbase',
'coinbase-'
'coinbase-',
'lcloud.',
'lcloud-'
]);
const lowKeywords = createKeywordFilter([
'-co-jp',
'customer.',
'customer-',
'.www-'
'.www-',
'instagram'
]);

const cacheKey = createCacheKey(__filename);

export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
const domainArr: string[] = [];

(await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry))))
(await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey))))
.forEach(appendArrayInPlaceCurried(domainArr));

return domainArr;
});

const domainCountMap: Record<string, number> = {};
const domainScoreMap: Record<string, number> = {};

span.traceChildSync('process phishing domain set', () => {
for (let i = 0, len = domainArr.length; i < len; i++) {
Expand All @@ -149,8 +158,13 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
const {
publicSuffix: tld,
domain: apexDomain,
subdomain
} = tldts.parse(line, looseTldtsOpt);
subdomain,
isPrivate
} = tldts.parse(line, loosTldOptWithPrivateDomains);

if (isPrivate) {
continue;
}

if (!tld) {
console.log(picocolors.yellow('[phishing domains] E0001'), 'missing tld', { line, tld });
Expand All @@ -161,67 +175,69 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
continue;
}

let sensitiveKeywordsHit: boolean | null = null;
if (tld.length < 6 && !tld.includes('.') && !BLACK_TLD.has(tld) && !(sensitiveKeywordsHit = sensitiveKeywords(line))) continue;

domainCountMap[apexDomain] ||= 0;
domainCountMap[apexDomain] += calcDomainAbuseScore(line, subdomain, sensitiveKeywordsHit);
domainCountMap[apexDomain] += 1;

if (!('apexDomain' in domainScoreMap)) {
domainScoreMap[apexDomain] = 0;
if (BLACK_TLD.has(tld)) {
domainScoreMap[apexDomain] += 4;
} else if (tld.length > 6) {
domainScoreMap[apexDomain] += 2;
}
}
domainScoreMap[apexDomain] += calcDomainAbuseScore(line, subdomain);
}
});

for (const domain in domainCountMap) {
if (domainCountMap[domain] >= 10 && !WHITELIST_MAIN_DOMAINS.has(domain)) {
for (const domain in domainScoreMap) {
if (
!WHITELIST_MAIN_DOMAINS.has(domain)
&& (
domainScoreMap[domain] >= 12
|| (domainScoreMap[domain] > 6 && domainCountMap[domain] >= 3)
)
) {
console.log({ domain });
domainArr.push(`.${domain}`);
}
}

return domainArr;
});

export function calcDomainAbuseScore(line: string, subdomain: string | null, sensitiveKeywordsHit: boolean | null) {
let weight = 1;
export function calcDomainAbuseScore(line: string, subdomain: string | null) {
let weight = 0;

const hitLowKeywords = lowKeywords(line);
if (subdomain) {
const hitLowKeywords = lowKeywords(subdomain);
const sensitiveKeywordsHit = sensitiveKeywords(subdomain);

sensitiveKeywordsHit ??= sensitiveKeywords(line);
if (sensitiveKeywordsHit) {
weight += 4;
if (hitLowKeywords) {
weight += 5;
if (sensitiveKeywordsHit) {
weight += 8;
if (hitLowKeywords) {
weight += 4;
}
} else if (hitLowKeywords) {
weight += 1;
}
} else if (hitLowKeywords) {
weight += 0.5;
}

const lineLen = line.length;

if (lineLen > 19) {
// Add more weight if the domain is long enough
if (lineLen > 44) {
weight += 3.5;
} else if (lineLen > 34) {
weight += 2.5;
} else if (lineLen > 29) {
weight += 1.5;
} else if (lineLen > 24) {
weight += 0.75;
} else {
weight += 0.25;
}
}
const subdomainLength = subdomain.length;

if (subdomain) {
if (subdomain.length > 40) {
weight += 3;
} else if (subdomain.length > 30) {
weight += 1.5;
} else if (subdomain.length > 20) {
if (subdomainLength > 40) {
weight += 8;
} else if (subdomainLength > 30) {
weight += 4;
} else if (subdomainLength > 20) {
weight += 2;
} else if (subdomainLength > 10) {
weight += 1;
} else if (subdomain.length > 10) {
weight += 0.1;
}
if (subdomain.slice(1).includes('.')) {
weight += 1;
weight += 2;
if (subdomain.includes('www.')) {
weight += 2;
}
}
}

Expand Down
6 changes: 4 additions & 2 deletions Build/lib/parse-filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false;
const temporaryBypass = typeof DEBUG_DOMAIN_TO_FIND === 'string';

/**
 * Pass-through helper: hands back exactly the value it was given.
 * Serves as the no-op default for the optional `extraCacheKey` transformer.
 */
const identity = function <T>(value: T): T {
  return value;
};

const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean, meta: string) => {
let line = processLine(l);
if (!line) return;
Expand Down Expand Up @@ -44,9 +46,9 @@ const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean

const cacheKey = createCacheKey(__filename);

export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) {
export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null, extraCacheKey: (input: string) => string = identity) {
return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply(
cacheKey(domainListsUrl),
extraCacheKey(cacheKey(domainListsUrl)),
async () => {
const domainSets: string[] = [];

Expand Down
24 changes: 14 additions & 10 deletions Source/domainset/cdn.conf
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,14 @@ consentdeliveryfd.azurefd.net
.rollupjs.org
.pnpm.io
telemetry.nextjs.org
telemetry.vercel.com
npmgraph.js.org
bundlephobia.com
pkg-size.dev
vanilla-extract.style
ipe-plugins.js.org
momentjs.com
typescript-eslint.io

# Badge Server
api.bintray.com
Expand Down Expand Up @@ -425,6 +427,7 @@ cdn.graph.office.net
res.cdn.office.net
res-1.cdn.office.net
res-geo.cdn.office.net
statics.teams.cdn.office.net
static2.sharepointonline.com
prod.msocdn.com
spoprod-a.akamaihd.net
Expand All @@ -451,6 +454,7 @@ xpaywalletcdn.azureedge.net
edge-consumer-static.azureedge.net
claritystatic.azureedge.net
.static.microsoft
.skypeassets.com
# Appcenter
mobilecenter.azureedge.net
# Microsoft Cookie Consent
Expand Down Expand Up @@ -587,18 +591,11 @@ s3media.squarespace.com
.f-droid.org

# >> StackBlitz
.c.staticblitz.com
.w.staticblitz.com
.t.staticblitz.com
.npm.staticblitz.com
.v-corp.staticblitz.com
.w-corp.staticblitz.com
ghavatars.staticblitz.com
nr.staticblitz.com
.w-credentialless.staticblitz.com
.staticblitz.com
.stackblitz.io
.local.webcontainer.io
.w-credentialless-staticblitz.com
.w-corp-staticblitz.com

# >> hCaptcha
newassets.hcaptcha.com
Expand Down Expand Up @@ -906,7 +903,7 @@ widgets.tree-nation.com
info.knak.com
apps.profitihub.com
app.cookieyes.com
cdn-cookieyes.com
.cdn-cookieyes.com
static.srcspot.com
vice-prod.sdiapi.com
sdk.snapkit.com
Expand Down Expand Up @@ -945,6 +942,8 @@ cdn1.stamped.io
app.backinstock.org
assets.videowise.com
app-static.turtl.co
.quip-cdn.com
.quip-marketing.com

cdn.transcend.io
.transcend-cdn.com
Expand Down Expand Up @@ -1484,6 +1483,8 @@ denali-static.grammarly.com
assets.extension.grammarly.com
assets.grammarly.com
static.grammarly.com
# this domain is behind CloudFront, so it is listed with the CDN hosts
config.extension.grammarly.com
# okta
.oktacdn.com
# OpenAI
Expand Down Expand Up @@ -2315,6 +2316,7 @@ cdn.circle.so
.eu-browse.startpage.com
.eu-proxy.startpage.com
.proxy.startpage.com
.proxy1.startpage.com
.proxy2.startpage.com
assets.matters.news
assets-next.matters.news
Expand Down Expand Up @@ -2767,3 +2769,5 @@ images.bfmtv.com
.assets.adidas.com
.glass.adidas.com
assets.context.ly
.kirafile.com
cms-assets.nodecraft.com
3 changes: 3 additions & 0 deletions Source/domainset/reject_sukka.conf
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ optimus-ads.amap.com
.sitebeat.crazydomains.com
# online d'n'd website builder (https://www.infonline.de/)
.apps.iocnt.de
# codeanywhere is a haven for abuse
.codeanyapp.com

# >> Qihoo 360
hot.m.shouji.360tpcdn.com
Expand Down Expand Up @@ -856,6 +858,7 @@ tag.clearbitscripts.com

.femetrics.grammarly.io
.f-log-extension.grammarly.io
.data.grammarly.com
.femetrics.qagr.io

cname.ebis.ne.jp
Expand Down
Loading

0 comments on commit b670ca0

Please sign in to comment.