Skip to content

Commit

Permalink
Add DOC
Browse files Browse the repository at this point in the history
  • Loading branch information
ishii-norimi committed Jun 23, 2024
1 parent 6243bc9 commit f2bbcfe
Show file tree
Hide file tree
Showing 6 changed files with 348 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ for (let i = 0; i < n; i++) {

| task | model |
| ---- | ----- |
| clustering | (Soft / Kernel / Genetic / Weighted / Bisecting) k-means, k-means++, k-medois, k-medians, x-means, G-means, LBG, ISODATA, Fuzzy c-means, Possibilistic c-means, k-harmonic means, MacQueen, Hartigan-Wong, Elkan, Hamelry, Drake, Yinyang, Agglomerative (complete linkage, single linkage, group average, Ward's, centroid, weighted average, median), DIANA, Monothetic, Mutual kNN, Mean shift, DBSCAN, OPTICS, DTSCAN, HDBSCAN, DENCLUE, DBCLASD, BRIDGE, CLUES, PAM, CLARA, CLARANS, BIRCH, CURE, ROCK, C2P, PLSA, Latent dirichlet allocation, GMM, VBGMM, Affinity propagation, Spectral clustering, Mountain, (Growing) SOM, GTM, (Growing) Neural gas, Growing cell structures, LVQ, ART, SVC, CAST, CHAMELEON, COLL, CLIQUE, PROCLUS, ORCLUS, FINDIT, NMF, Autoencoder |
| clustering | (Soft / Kernel / Genetic / Weighted / Bisecting) k-means, k-means++, k-medoids, k-medians, x-means, G-means, LBG, ISODATA, Fuzzy c-means, Possibilistic c-means, k-harmonic means, MacQueen, Hartigan-Wong, Elkan, Hamerly, Drake, Yinyang, Agglomerative (complete linkage, single linkage, group average, Ward's, centroid, weighted average, median), DIANA, Monothetic, Mutual kNN, Mean shift, DBSCAN, OPTICS, DTSCAN, HDBSCAN, DENCLUE, DBCLASD, BRIDGE, CLUES, PAM, CLARA, CLARANS, BIRCH, CURE, ROCK, C2P, PLSA, Latent dirichlet allocation, GMM, VBGMM, Affinity propagation, Spectral clustering, Mountain, (Growing) SOM, GTM, (Growing) Neural gas, Growing cell structures, LVQ, ART, SVC, CAST, CHAMELEON, COLL, CLIQUE, PROCLUS, ORCLUS, FINDIT, DOC, FastDOC, NMF, Autoencoder |
| classification | (Fisher's) Linear discriminant, Quadratic discriminant, Mixture discriminant, Least squares, (Multiclass / Kernel) Ridge, (Complement / Negation / Universal-set / Selective) Naive Bayes (gaussian), AODE, (Fuzzy / Weighted) k-nearest neighbor, Radius neighbor, Nearest centroid, ENN, ENaN, NNBCA, ADAMENN, DANN, IKNN, Decision tree, Random forest, Extra trees, GBDT, XGBoost, ALMA, (Aggressive) ROMMA, (Bounded) Online gradient descent, (Budgeted online) Passive aggressive, RLS, (Selective-sampling) Second order perceptron, AROW, NAROW, Confidence weighted, CELLIP, IELLIP, Normal herd, Stoptron, (Kernelized) Pegasos, MIRA, Forgetron, Projectron, Projectron++, Banditron, Ballseptron, (Multiclass) BSGD, ILK, SILK, (Multinomial) Logistic regression, (Multinomial) Probit, SVM, Gaussian process, HMM, CRF, Bayesian Network, LVQ, (Average / Multiclass / Voted / Kernelized / Selective-sampling / Margin / Shifting / Budget / Tighter / Tightest) Perceptron, PAUM, RBP, ADALINE, MADALINE, MLP, ELM, LMNN |
| semi-supervised classification | k-nearest neighbor, Radius neighbor, Label propagation, Label spreading, k-means, GMM, S3VM, Ladder network |
| regression | Least squares, Ridge, Lasso, Elastic net, RLS, Bayesian linear, Poisson, Least absolute deviations, Huber, Tukey, Least trimmed squares, Least median squares, Lp norm linear, SMA, Deming, Segmented, LOWESS, LOESS, spline, Naive Bayes, Gaussian process, Principal components, Partial least squares, Projection pursuit, Quantile regression, k-nearest neighbor, Radius neighbor, IDW, Nadaraya Watson, Priestley Chao, Gasser Muller, RBF Network, RVM, Decision tree, Random forest, Extra trees, GBDT, XGBoost, SVR, MLP, ELM, GMR, Isotonic, Ramer Douglas Peucker, Theil-Sen, Passing-Bablok, Repeated median |
Expand Down
1 change: 1 addition & 0 deletions js/model_selector.js
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ const AIMethods = [
{ value: 'proclus', title: 'PROCLUS' },
{ value: 'orclus', title: 'ORCLUS' },
{ value: 'findit', title: 'FINDIT' },
{ value: 'doc', title: 'DOC / FastDOC' },
{ value: 'plsa', title: 'PLSA' },
{ value: 'latent_dirichlet_allocation', title: 'Latent Dirichlet Allocation' },
{ value: 'nmf', title: 'NMF' },
Expand Down
39 changes: 39 additions & 0 deletions js/view/doc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { DOC, FastDOC } from '../../lib/model/doc.js'
import Controller from '../controller.js'

export default function (platform) {
	platform.setting.ml.usage = 'Click and add data point. Then, click "Fit" button.'
	platform.setting.ml.reference = {
		author: 'C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali',
		title: 'A monte carlo algorithm for fast projective clustering',
		year: 2002,
	}
	const controller = new Controller(platform)

	// Build the selected model, cluster the current training data and show the result.
	const fitModel = () => {
		let model = null
		if (type.value === 'DOC') {
			model = new DOC(alpha.value, beta.value, w.value)
		} else {
			// BUG FIX: this branch previously constructed DOC with five arguments
			// (the extra maxiter/d0 were silently ignored), so selecting "FastDOC"
			// still ran plain DOC. Use the imported FastDOC class instead.
			model = new FastDOC(alpha.value, beta.value, w.value, maxiter.value, d0.value)
		}

		model.fit(platform.trainInput)
		// Shift labels by +1: the model returns -1 for unclustered points and 0 for
		// the found cluster, so the platform renders them as categories 0 and 1.
		const pred = model.predict().map(v => v + 1)
		platform.trainResult = pred
	}

	const type = controller.select(['DOC', 'FastDOC']).on('change', () => {
		// maxiter/d0 only apply to FastDOC; hide their inputs for plain DOC.
		felm.element.style.display = type.value === 'DOC' ? 'none' : null
	})
	const alpha = controller.input.number({ label: ' alpha ', min: 0, max: 1, step: 0.01, value: 0.1 })
	const beta = controller.input.number({ label: ' beta ', min: 0, max: 0.5, step: 0.01, value: 0.25 })
	const w = controller.input.number({ label: ' width ', min: 0, max: 1000, step: 0.1, value: 0.1 })

	const felm = controller.span()
	felm.element.style.display = 'none'
	const maxiter = felm.input.number({ label: ' maxiter ', min: 1, max: 1000000, value: 100 })
	const d0 = felm.input.number({ label: ' d0 ', min: 1, max: 100, value: 2 })

	controller.input.button('Fit').on('click', fitModel)
}
203 changes: 203 additions & 0 deletions lib/model/doc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/**
* Density-based Optimal projective Clustering
*/
/**
 * Density-based Optimal projective Clustering
 */
export class DOC {
	// A monte carlo algorithm for fast projective clustering
	// https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=f7a389eb1742d16cf09fc0d631cc0d1e97d49dda
	/**
	 * @param {number} alpha Dense scale
	 * @param {number} beta Balanced value
	 * @param {number} w Width of cluster
	 */
	constructor(alpha, beta, w) {
		this._alpha = alpha
		this._beta = beta
		this._w = w
		// Per-sample labels (0 = member of the found cluster, -1 = outside it).
		this._p = []
		// Dimensions selected for the best cluster found by fit().
		this._d = []

		// Cluster quality mu(|C|, |D|) = |C| * (1 / beta)^|D| from the DOC paper.
		this._mu = (a, b) => a * (1 / this._beta) ** b
	}

	// Draw k distinct indices from [0, n) uniformly at random.
	_select(n, k) {
		const picks = []
		for (let i = 0; i < k; i++) {
			picks.push(Math.floor(Math.random() * (n - i)))
		}
		// Map the shrinking-range draws back onto distinct values in [0, n):
		// every later draw at or above an earlier one is shifted up by one.
		for (let i = picks.length - 1; i >= 0; i--) {
			for (let j = picks.length - 1; j > i; j--) {
				if (picks[i] <= picks[j]) {
					picks[j]++
				}
			}
		}
		return picks
	}

	/**
	 * Fit model.
	 * @param {Array<Array<number>>} datas Sample data
	 */
	fit(datas) {
		const n = datas.length
		const dim = datas[0].length
		// Discriminating-set size and trial count as prescribed by the paper.
		const r = Math.min(n, Math.ceil(Math.log(2 * dim) / Math.log(1 / (2 * this._beta))))
		const m = (2 / this._alpha) ** r * Math.log(4)
		let bestQuality = 0
		let bestCluster = []
		let bestDims = []

		for (let outer = 0; outer < 2 / this._alpha; outer++) {
			// Random pivot point; candidate clusters are boxes of width 2w around it.
			const pivot = datas[Math.floor(Math.random() * n)]
			for (let trial = 0; trial < m; trial++) {
				const sample = this._select(n, r)

				// A dimension is kept when every sampled point stays within w of the pivot.
				const dims = []
				for (let k = 0; k < dim; k++) {
					if (sample.every(t => Math.abs(datas[t][k] - pivot[k]) <= this._w)) {
						dims.push(k)
					}
				}
				// Bounding box: +/- w on the selected dimensions, unbounded elsewhere.
				const low = Array(dim).fill(-Infinity)
				const high = Array(dim).fill(Infinity)
				for (const k of dims) {
					low[k] = pivot[k] - this._w
					high[k] = pivot[k] + this._w
				}
				const members = []
				for (let t = 0; t < n; t++) {
					if (datas[t].every((v, k) => low[k] <= v && v <= high[k])) {
						members.push(t)
					}
				}
				// Discard candidates below the density threshold alpha * n.
				if (members.length < this._alpha * n) {
					continue
				}
				const quality = this._mu(members.length, dims.length)
				if (quality > bestQuality) {
					bestQuality = quality
					bestCluster = members
					bestDims = dims
				}
			}
		}

		this._p = Array(n).fill(-1)
		for (const t of bestCluster) {
			this._p[t] = 0
		}
		this._d = bestDims
	}

	/**
	 * Returns predicted categories.
	 * @returns {number[]} Predicted values
	 */
	predict() {
		return this._p
	}
}

/**
* Fast Density-based Optimal projective Clustering
*/
/**
 * Fast Density-based Optimal projective Clustering
 */
export class FastDOC {
	// A monte carlo algorithm for fast projective clustering
	// https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=f7a389eb1742d16cf09fc0d631cc0d1e97d49dda
	/**
	 * @param {number} alpha Dense scale
	 * @param {number} beta Balanced value
	 * @param {number} w Width of cluster
	 * @param {number} maxiter Maximum inner iteration
	 * @param {number} d0 Threshold of selected dimension count
	 */
	constructor(alpha, beta, w, maxiter, d0) {
		this._alpha = alpha
		this._beta = beta
		this._w = w
		// Cap on the number of inner trials per pivot.
		this._maxiter = maxiter
		// Search stops early once a candidate spans at least d0 dimensions.
		this._d0 = d0
		// Per-sample labels (0 = member of the found cluster, -1 = outside it).
		this._p = []
		// Dimensions selected for the best cluster found by fit().
		this._d = []
	}

	// Draw k distinct indices from [0, n) uniformly at random.
	_select(n, k) {
		const picks = []
		for (let i = 0; i < k; i++) {
			picks.push(Math.floor(Math.random() * (n - i)))
		}
		// Map the shrinking-range draws back onto distinct values in [0, n):
		// every later draw at or above an earlier one is shifted up by one.
		for (let i = picks.length - 1; i >= 0; i--) {
			for (let j = picks.length - 1; j > i; j--) {
				if (picks[i] <= picks[j]) {
					picks[j]++
				}
			}
		}
		return picks
	}

	/**
	 * Fit model.
	 * @param {Array<Array<number>>} datas Sample data
	 */
	fit(datas) {
		const n = datas.length
		const dim = datas[0].length
		// Discriminating-set size as in DOC; trial count additionally capped by maxiter.
		const r = Math.min(n, Math.ceil(Math.log(2 * dim) / Math.log(1 / (2 * this._beta))))
		const m = Math.min(this._maxiter, (2 / this._alpha) ** r * Math.log(4))

		let bestDims = []
		let bestPivot = null

		// FastDOC skips the density check and simply maximizes the number of
		// selected dimensions, stopping as soon as d0 dimensions are reached.
		search: for (let outer = 0; outer < 2 / this._alpha; outer++) {
			const pivot = datas[Math.floor(Math.random() * n)]
			for (let trial = 0; trial < m; trial++) {
				const sample = this._select(n, r)

				const dims = []
				for (let k = 0; k < dim; k++) {
					if (sample.every(t => Math.abs(datas[t][k] - pivot[k]) <= this._w)) {
						dims.push(k)
					}
				}
				if (dims.length >= bestDims.length) {
					bestDims = dims
					bestPivot = pivot
				}
				if (bestDims.length >= this._d0) {
					break search
				}
			}
		}
		// Bounding box: +/- w around the pivot on the selected dimensions only.
		const low = Array.from({ length: dim }, () => -Infinity)
		const high = Array.from({ length: dim }, () => Infinity)
		for (const k of bestDims) {
			low[k] = bestPivot[k] - this._w
			high[k] = bestPivot[k] + this._w
		}

		this._p = datas.map(row => (row.every((v, k) => low[k] <= v && v <= high[k]) ? 0 : -1))
		this._d = bestDims
	}

	/**
	 * Returns predicted categories.
	 * @returns {number[]} Predicted values
	 */
	predict() {
		return this._p
	}
}
49 changes: 49 additions & 0 deletions tests/gui/view/doc.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { getPage } from '../helper/browser'

// Browser (GUI) tests for the DOC / FastDOC view: verify the default control
// values and that fitting colors the data points into two groups.
describe('clustering', () => {
	/** @type {Awaited<ReturnType<getPage>>} */
	let page
	beforeEach(async () => {
		// Open the app, choose the clustering task ('CT'), then select the DOC model.
		page = await getPage()
		const taskSelectBox = await page.waitForSelector('#ml_selector dl:first-child dd:nth-child(5) select')
		await taskSelectBox.selectOption('CT')
		const modelSelectBox = await page.waitForSelector('#ml_selector .model_selection #mlDisp')
		await modelSelectBox.selectOption('doc')
	})

	afterEach(async () => {
		await page?.close()
	})

	test('initialize', async () => {
		const methodMenu = await page.waitForSelector('#ml_selector #method_menu')
		const buttons = await methodMenu.waitForSelector('.buttons')

		// Defaults defined in js/view/doc.js: type=DOC, alpha=0.1, beta=0.25, width=0.1.
		const type = await buttons.waitForSelector('select:nth-of-type(1)')
		await expect((await type.getProperty('value')).jsonValue()).resolves.toBe('DOC')
		const alpha = await buttons.waitForSelector('input:nth-of-type(1)')
		await expect(alpha.getAttribute('value')).resolves.toBe('0.1')
		const beta = await buttons.waitForSelector('input:nth-of-type(2)')
		await expect(beta.getAttribute('value')).resolves.toBe('0.25')
		const w = await buttons.waitForSelector('input:nth-of-type(3)')
		await expect(w.getAttribute('value')).resolves.toBe('0.1')
	})

	test('learn', async () => {
		const methodMenu = await page.waitForSelector('#ml_selector #method_menu')
		const buttons = await methodMenu.waitForSelector('.buttons')

		const fitButton = await buttons.waitForSelector('input[value=Fit]')
		await fitButton.evaluate(el => el.click())

		// After fitting, points are recolored by cluster label; with the default
		// demo data we expect exactly two distinct fill colors (cluster vs. rest).
		const svg = await page.waitForSelector('#plot-area svg')
		await svg.waitForSelector('.datas circle')
		const circles = await svg.$$('.datas circle')
		const colors = new Set()
		for (const circle of circles) {
			const fill = await circle.evaluate(el => el.getAttribute('fill'))
			colors.add(fill)
		}
		expect(colors.size).toBe(2)
	})
})
55 changes: 55 additions & 0 deletions tests/lib/model/doc.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { jest } from '@jest/globals'
jest.retryTimes(3)

import Matrix from '../../../lib/util/matrix.js'
import { DOC, FastDOC } from '../../../lib/model/doc.js'

import { randIndex } from '../../../lib/evaluate/clustering.js'

// Unit tests for the DOC model on synthetic gaussian data.
describe('doc', () => {
	test('small alpha', () => {
		// Two well-separated blobs: the recovered cluster should match one of them.
		const n = 100
		const x = Matrix.concat(Matrix.randn(n, 3, [0, 5, 0], 0.1), Matrix.randn(n, 3, [10, 5, 10], 0.1)).toArray()
		const model = new DOC(0.1, 0.2, 1.0)

		model.fit(x)
		const y = model.predict()
		expect(y).toHaveLength(x.length)

		const t = Array.from(x, (_, i) => Math.floor(i / n))
		expect(randIndex(y, t)).toBeGreaterThan(0.9)
	})

	test('big alpha', () => {
		// With a density threshold of 0.9 neither blob can qualify on its own;
		// the model must still return one label per sample.
		const n = 50
		const x = Matrix.concat(Matrix.randn(n, 3, 0, 0.1), Matrix.randn(n, 3, 10, 0.1)).toArray()
		const model = new DOC(0.9, 0.2, 1.0)

		model.fit(x)
		expect(model.predict()).toHaveLength(x.length)
	})
})

// Unit tests for the FastDOC model on synthetic gaussian data.
describe('fastdoc', () => {
	test('small alpha', () => {
		// Two well-separated blobs: the recovered cluster should match one of them.
		const n = 100
		const x = Matrix.concat(Matrix.randn(n, 3, [0, 5, 0], 0.1), Matrix.randn(n, 3, [10, 5, 10], 0.1)).toArray()
		const model = new FastDOC(0.1, 0.2, 1.0, 100, 2)

		model.fit(x)
		const y = model.predict()
		expect(y).toHaveLength(x.length)

		const t = Array.from(x, (_, i) => Math.floor(i / n))
		expect(randIndex(y, t)).toBeGreaterThan(0.9)
	})
})

0 comments on commit f2bbcfe

Please sign in to comment.