Skip to content

Commit

Permalink
tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
boyter committed Jun 19, 2024
1 parent 33e699e commit 042a1a1
Show file tree
Hide file tree
Showing 21 changed files with 2,068 additions and 31 deletions.
662 changes: 658 additions & 4 deletions LICENSE

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions NOTES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
Detection Rules
---------------

Check the filename

"li[cs]en[cs]e(s?)",
"legal",
"copy(left|right|ing)",
"unlicense",
"l?gpl([-_ v]?)(\\d\\.?\\d)?",
"bsd",
"mit",
"apache",

If it falls into one of the above, it's highly likely it is a licence,
and should be tested. Note that the name itself strongly indicates which
licence it is, with unlicense for example indicating it is the Unlicense.

Something like licence, legal, or copy(left|right|ing) needs to be checked
because while it is highly likely to contain a licence we cannot be sure
as to which licence it actually is. It's also possible that these examples
could have multiple licenses in them. Example: github.com/valkey/valkey/COPYING

"",
".md",
".rst",
".html",
".txt",

Where the file matches the above patterns, i.e. it has no extension or
one of the others listed, we should inspect it to see if it has a license. It's possible
a licence exists here, but we cannot be sure. Note that it's possible there are multiple
licences in the file, which needs to be dealt with.

// SPDX-License-Identifier: MIT OR Unlicense

For all other files, there are a few possibilities.
The first is that it contains an SPDX header such as the above which indicates
which license the file is under. It's also possible that the header will contain
a full copy of another licence such as MIT, GPL or otherwise, possibly inside a comment
or a long string declaration in the case of code. It's possible it has multiple licences.

3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.20

require (
github.com/BobuSumisu/aho-corasick v1.0.3
github.com/boyter/gocodewalker v1.1.0
github.com/boyter/gocodewalker v1.3.3
github.com/spf13/cobra v1.0.0
github.com/tealeg/xlsx v1.0.3
github.com/texttheater/golang-levenshtein v0.0.0-20180516184445-d188e65d659e
Expand All @@ -14,4 +14,5 @@ require (
github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964 // indirect
github.com/inconshreveable/mousetrap v1.0.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/sync v0.7.0 // indirect
)
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ github.com/boyter/gocodewalker v1.0.1-0.20230321052529-86e91fcf4a3a h1:/r8uPJqza
github.com/boyter/gocodewalker v1.0.1-0.20230321052529-86e91fcf4a3a/go.mod h1:CppTdM9RtednxKPcit+Zn36FXqwYaDIGufs/tenbswo=
github.com/boyter/gocodewalker v1.1.0 h1:R/BOXRB5WcE5x/Q6Ln95GUQOQE4PAk86nq6WccZqZmM=
github.com/boyter/gocodewalker v1.1.0/go.mod h1:CppTdM9RtednxKPcit+Zn36FXqwYaDIGufs/tenbswo=
github.com/boyter/gocodewalker v1.3.3 h1:yPSbWT1wGmPSC73jASY0GaJu4EDN3FROfwYUDQjTmuE=
github.com/boyter/gocodewalker v1.3.3/go.mod h1:hXG8xzR1uURS+99P5/3xh3uWHjaV2XfoMMmvPyhrCDg=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk=
Expand Down Expand Up @@ -118,6 +120,8 @@ golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAG
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
Expand Down
96 changes: 96 additions & 0 deletions pkg/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// SPDX-License-Identifier: AGPL-3.0

package pkg

import (
"fmt"
"regexp"
"strings"
)

// commonLicences lists SPDX identifiers of frequently used licences. Guess
// compares a licence file's name against these (case-insensitively) before
// falling back to content comparison.
var commonLicences = []string{
	"MIT",
	"Apache-2.0",
	"GPL-3.0",
	"AGPL-3.0",
	"BSD-3-Clause",
	"GPL-2.0",
	"BSD-2-Clause",
	"CC0-1.0",
	"LGPL-3.0",
	"LGPL-2.1",
	"ISC",
	"0BSD",
	"LGPL-2.0",
	"Unlicense",
	"BSD-3-Clause-No-Nuclear-License-2014",
	"MPL-2.0",
	"EPL-1.0",
	"MPL-2.0-no-copyleft-exception",
	"AGPL-1.0",
	"CC-BY-4.0",
	"IPL-1.0",
	"CPL-1.0",
	"CC-BY-3.0",
	"CC-BY-SA-4.0",
	"WTFPL",
	"Zlib",
	"CC-BY-SA-3.0",
	"Cube",
	"JSON",
	"BitTorrent-1.0",
}

// Lifted from https://github.com/go-enry/go-license-detector/blob/580c5627556917dee649cdb2b179cb42d6c56a60/licensedb/internal/investigation.go#L29
// SPDX-License-Identifier: Apache-2.0
var (
	// licenseFileNames holds regular-expression fragments for base names
	// that suggest a file contains licensing information.
	licenseFileNames = []string{
		"li[cs]en[cs]e(s?)",
		"legal",
		"copy(left|right|ing)",
		"unlicense",
		"l?gpl([-_ v]?)(\\d\\.?\\d)?",
		"bsd",
		"mit",
		"apache",
	}

	// fileExtensions holds the extensions accepted by readmeFileRe.
	// The empty string stands for "no extension at all".
	fileExtensions = []string{
		"",
		".md",
		".rst",
		".html",
		".txt",
	}

	// licenseFileRe matches a lower-cased file name in which one of the
	// licenseFileNames fragments appears as a whole component: preceded by
	// nothing or by text ending in '-', '_', '.' or ' ', and followed by
	// nothing or by text starting with one of those separators.
	licenseFileRe = regexp.MustCompile(fmt.Sprintf(
		"^(|.*[-_. ])(%s)(|[-_. ].*)$",
		strings.Join(licenseFileNames, "|"),
	))

	// readmeFileRe matches a lower-cased file name that is exactly "readme"
	// or "guidelines" followed by one of fileExtensions. The dots in the
	// extensions are escaped so they only match a literal '.'.
	readmeFileRe = regexp.MustCompile(fmt.Sprintf(
		"^(readme|guidelines)(%s)$",
		strings.ReplaceAll(strings.Join(fileExtensions, "|"), ".", "\\."),
	))
)

// IsLicenceFile reports whether filename looks like the name of a licence
// file. The comparison is case-insensitive and is performed against
// licenseFileRe.
func IsLicenceFile(filename string) bool {
	// Names with more than two dots are rejected up front; this is an
	// attempt to filter out false positives that come from Java file names.
	if dots := strings.Count(filename, "."); dots > 2 {
		return false
	}

	lowered := strings.ToLower(filename)
	return licenseFileRe.MatchString(lowered)
}

// IsReadmeFile reports whether filename, compared case-insensitively,
// matches readmeFileRe.
func IsReadmeFile(filename string) bool {
	lowered := strings.ToLower(filename)
	return readmeFileRe.MatchString(lowered)
}

// compareOptimize condenses input into a shorter string for comparison.
//
// The input is split on whitespace. Each distinct token is considered once,
// at its first occurrence, and is dropped when a strictly longer token that
// contains it appears later in the input. Surviving tokens are concatenated
// in order with no separator between them.
func compareOptimize(input string) string {
	words := strings.Fields(input)
	seen := make(map[string]struct{})
	var out strings.Builder

	for idx, word := range words {
		// Only the first occurrence of a token is ever examined; repeats are
		// skipped outright, which matters for performance on long inputs.
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}

		// Look only at tokens after this one. A strictly longer token that
		// contains word can never equal it, so no separate equality check
		// is needed.
		subsumed := false
		for _, later := range words[idx+1:] {
			if len(later) > len(word) && strings.Contains(later, word) {
				subsumed = true
				break
			}
		}

		if !subsumed {
			out.WriteString(word)
		}
	}

	return out.String()
}
5 changes: 5 additions & 0 deletions pkg/constants.go

Large diffs are not rendered by default.

112 changes: 112 additions & 0 deletions pkg/detector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// SPDX-License-Identifier: AGPL-3.0

package pkg

import (
"encoding/base64"
"encoding/json"
"github.com/boyter/lc/pkg/levenshtein"
"math"
"strings"
)

// LicenceDetector guesses which licence a file contains by comparing the
// file against the entries in Database.
type LicenceDetector struct {
// Database holds the known licence groups; LoadDatabase fills it in.
Database []License
}

// NewLicenceDetector allocates a LicenceDetector, calls LoadDatabase on it
// and returns it.
func NewLicenceDetector() *LicenceDetector {
	detector := &LicenceDetector{}
	detector.LoadDatabase()
	return detector
}

// License is one entry of the detector database: a group of SPDX identifiers
// that are treated as the same licence, together with sample texts and
// keywords for that group. It is decoded from JSON using the field tags below.
type License struct {
LicenseTexts []string `json:"licenseTexts"` // examples of text that we have for these licences
LicenseIds []string `json:"licenseIds"` // SPDX ids where licences are considered identical
Keywords []string `json:"keywords"` // keywords that are unique and can be used to identify this group of licences
}

// LoadDatabase will initialize the database values and should only be called
// once such as in an init. It is a no-op when Database already has entries.
//
// The data comes from database_keywords, which is base64-decoded and then
// parsed as JSON into Database.
func (l *LicenceDetector) LoadDatabase() {
	if len(l.Database) == 0 {
		// Both errors are intentionally discarded: the method has no error
		// return. NOTE(review): database_keywords is presumably a constant
		// compiled into the package, so failure would be a build-time
		// mistake rather than a runtime condition - confirm.
		decoded, _ := base64.StdEncoding.DecodeString(database_keywords)
		_ = json.Unmarshal(decoded, &l.Database)
	}
}

// LicenseGuess is a single result returned by LicenceDetector.Guess.
type LicenseGuess struct {
// Name is the licence identifier that was guessed; Guess fills it from
// commonLicences or from a database entry's LicenseIds.
Name string
}

// Guess tries to work out which licence a file contains from its name and
// its content.
//
// Only files whose name satisfies IsLicenceFile or IsReadmeFile are
// examined; for anything else nil is returned.
//
//   - For a licence file whose name equals one of commonLicences
//     (case-insensitively), that identifier is returned without looking at
//     the content.
//   - Otherwise, for licence files and readme files alike, the content is
//     compared against every sample text in the database and the first SPDX
//     id of the closest entry is returned.
//
// At most one guess is returned. nil is returned when no database entry
// could be selected.
func (l *LicenceDetector) Guess(filename string, content string) []LicenseGuess {
	isLicence := IsLicenceFile(filename)

	if isLicence {
		// Check if the filename matches one of the common licences, in which
		// case return that, since it seems unlikely someone would add a file
		// called LGPL-2.0 without it actually being that licence.
		for _, li := range commonLicences {
			if strings.EqualFold(filename, li) {
				return []LicenseGuess{{Name: li}}
			}
		}
	}

	if isLicence || IsReadmeFile(filename) {
		// For a licence file we are confident a licence is present but not
		// which one; for a readme a licence may or may not be present. In
		// both cases start by assuming the file holds a single licence and
		// pick the closest known text.
		return l.closestLicence(content)
	}

	return nil
}

// closestLicence returns a single guess naming the database entry whose
// sample text has the smallest edit distance to content, after both sides
// have been passed through compareOptimize. Ties keep the earliest entry.
// It returns nil when the database has no sample texts or the winning entry
// has no SPDX ids.
//
// NOTE(review): there is no upper bound on the accepted distance, so some
// entry is returned however dissimilar the content is.
func (l *LicenceDetector) closestLicence(content string) []LicenseGuess {
	target := []rune(compareOptimize(content))

	best := -1 // index into l.Database of the closest entry so far
	bestDistance := math.MaxInt
	for i := range l.Database {
		for _, text := range l.Database[i].LicenseTexts {
			d := levenshtein.DistanceForStrings([]rune(compareOptimize(text)), target, levenshtein.DefaultOptions)
			if d < bestDistance {
				best = i
				bestDistance = d
			}
		}
	}

	if best < 0 || len(l.Database[best].LicenseIds) == 0 {
		return nil
	}

	return []LicenseGuess{{Name: l.Database[best].LicenseIds[0]}}
}
Loading

0 comments on commit 042a1a1

Please sign in to comment.