Skip to content

Commit

Permalink
Merge pull request #177 from NetSepio/vaibhavvvvv/dev
Browse files Browse the repository at this point in the history
scrape_urls : handling routes as urls and change in chromdp task
  • Loading branch information
p-shubh authored Jun 4, 2024
2 parents 468addf + ebb63d2 commit f124bbe
Showing 1 changed file with 22 additions and 9 deletions.
31 changes: 22 additions & 9 deletions api/v1/summary/scrape_urls.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package summary
import (
"context"
"fmt"
"net/url"
"strings"

"github.com/chromedp/cdproto/cdp"
Expand All @@ -25,11 +26,11 @@ func extractLinksFromURL(url string) ([]string, error) {

var links []string

if err := chromedp.Run(ctx, navigateToWebsite(url)); err != nil {
return nil, fmt.Errorf("failed to navigate to website: %v", err)
}

if err := chromedp.Run(ctx, extractLinks(&links)); err != nil {
if err := chromedp.Run(ctx, chromedp.Tasks{
chromedp.Navigate(url),
chromedp.WaitVisible("a", chromedp.ByQuery),
extractLinks(url, &links),
}); err != nil {
return nil, fmt.Errorf("failed to extract links: %v", err)
}

Expand All @@ -40,20 +41,32 @@ func navigateToWebsite(url string) chromedp.Action {
return chromedp.Navigate(url)
}

func extractLinks(links *[]string) chromedp.Action {
func extractLinks(baseURL string, links *[]string) chromedp.Action {
return chromedp.ActionFunc(func(ctx context.Context) error {
var nodes []*cdp.Node
if err := chromedp.Run(ctx, chromedp.Nodes("a", &nodes)); err != nil {
return err
}

base, err := url.Parse(baseURL)
if err != nil {
return fmt.Errorf("invalid base URL: %v", err)
}

for _, node := range nodes {
href := node.AttributeValue("href")
if href != "" {
*links = append(*links, href)
link, err := url.Parse(href)
if err != nil {
continue
}
if !link.IsAbs() {
link = base.ResolveReference(link)
}
*links = append(*links, link.String())
}
}
// fmt.Println("AL: : ", links)
// fmt.Println("Extracted links: ", *links)
return nil
})
}
Expand Down Expand Up @@ -81,7 +94,7 @@ func extractContentFromLink(link string) (string, error) {
if err := chromedp.Run(ctx, extractText(&content)); err != nil {
return "", fmt.Errorf("failed to extract text content: %v", err)
}
fmt.Println(content)
// fmt.Println(content)

return content, nil
}
Expand Down

0 comments on commit f124bbe

Please sign in to comment.