From 169ec84ef0c191356f2eba92584b941131f78809 Mon Sep 17 00:00:00 2001 From: Dmitry Ledentsov Date: Tue, 17 Sep 2024 17:23:58 +0200 Subject: [PATCH] feat: limiting processing the response body via `limitBodyToNBytes` when `searchForBodyPatterns==true` take 2 --- .link-checker-service.toml | 1 + CHANGES.md | 6 ++ cmd/root.go | 3 + infrastructure/url_checker.go | 57 ++++++++++++++- infrastructure/url_checker_test.go | 107 ++++++++++++++++++++++++++++- 5 files changed, 172 insertions(+), 2 deletions(-) diff --git a/.link-checker-service.toml b/.link-checker-service.toml index 12171d8..4817d88 100644 --- a/.link-checker-service.toml +++ b/.link-checker-service.toml @@ -89,6 +89,7 @@ regex = "Login Service" [HTTPClient] maxRedirectsCount = 15 +limitBodyToNBytes = 10000000000 timeoutSeconds = 45 userAgent = "lcs/0.9" browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36" diff --git a/CHANGES.md b/CHANGES.md index cc08632..bfc5ddb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,12 @@ Notable changes will be documented here +## 0.9.37 + +- limiting processing the response body via `limitBodyToNBytes` when `searchForBodyPatterns==true` +- upgraded dependencies +- Go v1.23 + ## 0.9.36 - upgraded dependencies diff --git a/cmd/root.go b/cmd/root.go index 9899d34..09d2b82 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -42,6 +42,7 @@ const ( proxyKey = "proxy" pacScriptURLKey = "pacScriptURL" maxRedirectsCountKey = "maxRedirectsCount" + limitBodyToNBytesKey = "limitBodyToNBytes" timeoutSecondsKey = "timeoutSeconds" userAgentKey = "userAgent" browserUserAgentKey = "browserUserAgent" @@ -102,6 +103,8 @@ func init() { _ = viper.BindPFlag(httpClientMapKey+skipCertificateCheckKey, rootCmd.PersistentFlags().Lookup(skipCertificateCheckKey)) rootCmd.PersistentFlags().Bool(enableRequestTracingKey, false, "HTTP client: enable request tracing") _ = viper.BindPFlag(httpClientMapKey+enableRequestTracingKey, 
rootCmd.PersistentFlags().Lookup(enableRequestTracingKey)) + rootCmd.PersistentFlags().Uint(limitBodyToNBytesKey, 0, "HTTP client: maximum number of bytes to read from the body when searching for patterns. Unlimited if 0!") + _ = viper.BindPFlag(httpClientMapKey+limitBodyToNBytesKey, rootCmd.PersistentFlags().Lookup(limitBodyToNBytesKey)) // service rootCmd.PersistentFlags().UintP(maxConcurrentHTTPRequestsKey, "c", 256, "maximum number of total concurrent HTTP requests") _ = viper.BindPFlag(maxConcurrentHTTPRequestsKey, rootCmd.PersistentFlags().Lookup(maxConcurrentHTTPRequestsKey)) diff --git a/infrastructure/url_checker.go b/infrastructure/url_checker.go index eb6e736..a2ff0d9 100644 --- a/infrastructure/url_checker.go +++ b/infrastructure/url_checker.go @@ -10,6 +10,7 @@ import ( "context" "crypto/tls" "fmt" + "io" "log" "net" "net/http" @@ -30,6 +31,7 @@ import ( "github.com/go-resty/resty/v2" ) +const defaultLimitBodyToNBytes = 0 const defaultMaxRedirectsCount = 15 const defaultTimeoutSeconds = 10 const defaultUserAgent = "lcs/0.9" @@ -72,6 +74,7 @@ type urlCheckerSettings struct { EnableRequestTracing bool URLCheckerPlugins []string PacScriptURL string + LimitBodyToNBytes uint } // URLChecker interface that all layers should conform to @@ -203,6 +206,7 @@ func getURLCheckerSettings() urlCheckerSettings { UserAgent: defaultUserAgent, BrowserUserAgent: defaultBrowserUserAgent, AcceptHeader: defaultAcceptHeader, + LimitBodyToNBytes: defaultLimitBodyToNBytes, } if proxyURL := viper.GetString("proxy"); proxyURL != "" { @@ -220,6 +224,7 @@ func getURLCheckerSettings() urlCheckerSettings { } s.MaxRedirectsCount = viper.GetUint("HTTPClient.maxRedirectsCount") + s.LimitBodyToNBytes = viper.GetUint("HTTPClient.limitBodyToNBytes") s.TimeoutSeconds = viper.GetUint("HTTPClient.timeoutSeconds") if v := viper.GetString("HTTPClient.userAgent"); v != "" { s.UserAgent = v @@ -240,6 +245,7 @@ func getURLCheckerSettings() urlCheckerSettings { log.Printf("HTTP client 
// safelyTrimmedStream reads input and returns its content as a string,
// consuming and returning at most limit bytes. A limit of 0 disables the cap
// and the whole stream is read.
//
// The function is best effort: if reading fails part-way, the bytes received
// before the failure are returned and the error is dropped. A nil input
// yields an empty string.
func safelyTrimmedStream(input io.Reader, limit uint) string {
	if input == nil {
		return ""
	}

	// io.LimitReader takes an int64. A limit that does not fit is left
	// uncapped here: no in-memory body can reach that size anyway.
	if limit > 0 && uint64(limit) < 1<<63 {
		input = io.LimitReader(input, int64(limit))
	}

	// The error is ignored on purpose: io.ReadAll hands back everything it
	// read before the failure, which is exactly the partial body we want.
	data, _ := io.ReadAll(input)

	return string(safelyTrimmedString(data, limit))
}

// safelyTrimmedString returns s cut down to at most limit bytes. A limit of 0
// means "do not trim". The cut is byte-based, so it may split a multi-byte
// UTF-8 sequence. The result shares its backing array with s.
func safelyTrimmedString(s []byte, limit uint) []byte {
	// Compare as unsigned values: converting a very large limit to int
	// would wrap to a negative number and make the slice expression panic.
	if limit == 0 || uint64(len(s)) <= uint64(limit) {
		return s
	}
	return s[:limit]
}
assert.NotNil(t, res.Error, "the response should have failed due to the abort") assert.NotEqual(t, http.StatusOK, res.Code) } + +const startChunk = "start-" + +var testStringToLimit = startChunk + + strings.Repeat("a", 300) + + strings.Repeat("b", 300) + +func TestLimitingBodyReading(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = fmt.Fprintln(w, + testStringToLimit) + })) + log.Println("Test server started at:", ts.URL) + defer ts.Close() + setUpViperTestConfiguration() + viper.Set("searchForBodyPatterns", true) + viper.Set("HTTPClient.limitBodyToNBytes", uint(100)) + res := NewURLCheckerClient().CheckURL(context.Background(), ts.URL) + assert.Equal(t, http.StatusOK, res.Code) + assert.Contains(t, res.BodyPatternsFound, "start-a") + assert.NotContains( + t, + res.BodyPatternsFound, + "ab", + "the repeated 'b' part of the message should have not been processed", + ) +} + +func Test_safelyTrimmedStream(t *testing.T) { + t.Run("limiting empty input produces empty string", func(t *testing.T) { + assert.Equal(t, "", safelyTrimmedStream(streamOf(""), 10)) + }) + + t.Run("non-empty input is not limited if no limit configured", func(t *testing.T) { + assert.Equal(t, testStringToLimit, safelyTrimmedStream(streamOf(testStringToLimit), 0)) + }) + + t.Run("limiting input to a size smaller than a chunk returns string of the limit length", + func(t *testing.T) { + assert.Equal(t, startChunk, safelyTrimmedStream(streamOf(testStringToLimit), uint(len(startChunk)))) + }) + + t.Run("limiting input to a size larger than itself returns the original string", + func(t *testing.T) { + assert.Equal(t, testStringToLimit, safelyTrimmedStream(streamOf(testStringToLimit), 2000)) + }) + + t.Run("limiting input to one byte results in one character", + func(t *testing.T) { + assert.Equal(t, 1, len(safelyTrimmedStream(streamOf(testStringToLimit), 1))) + }) + + t.Run("limiting input larger than the the buffer (1kB) to a limit 
// faultyReader is a test double for an io.Reader that delivers the first
// errorAt bytes of input and reports an error on every Read call.
type faultyReader struct {
	input   string
	errorAt int
}

// Read copies up to errorAt bytes of the input into p and always returns a
// non-nil error together with the number of bytes actually copied.
//
// The byte count is clamped to the input length and to len(p), so the reader
// honours the io.Reader contract (0 <= n <= len(p)) instead of panicking when
// the caller's buffer or the input is shorter than errorAt.
func (f *faultyReader) Read(p []byte) (int, error) {
	n := f.errorAt
	if n > len(f.input) {
		n = len(f.input)
	}
	if n < 0 {
		n = 0
	}
	// copy stops at the shorter of the two operands, which bounds n by len(p).
	n = copy(p, f.input[:n])
	return n, errors.New("expected fault")
}

// faultyReaderOf returns a reader that yields the first i bytes of s and
// fails on every read.
func faultyReaderOf(s string, i int) io.Reader {
	return &faultyReader{
		input:   s,
		errorAt: i,
	}
}

// streamOf wraps s in an io.Reader.
func streamOf(s string) io.Reader {
	return strings.NewReader(s)
}