From b43a4aad532da186ba7e28621d0898b7af2a1cfd Mon Sep 17 00:00:00 2001 From: omegaatt36 Date: Sun, 4 Aug 2024 22:26:59 +0800 Subject: [PATCH] feat: support www.threads.net --- .github/workflows/stream_threads.yml | 31 ++++++ README.md | 1 + app/register.go | 1 + extractors/threads/threads.go | 151 +++++++++++++++++++++++++++ extractors/threads/threads_test.go | 56 ++++++++++ 5 files changed, 240 insertions(+) create mode 100644 .github/workflows/stream_threads.yml create mode 100644 extractors/threads/threads.go create mode 100644 extractors/threads/threads_test.go diff --git a/.github/workflows/stream_threads.yml b/.github/workflows/stream_threads.yml new file mode 100644 index 000000000..75c975c76 --- /dev/null +++ b/.github/workflows/stream_threads.yml @@ -0,0 +1,31 @@ +name: threads + +on: + push: + paths: + - "extractors/threads/*.go" + - ".github/workflows/stream_threads.yml" + pull_request: + paths: + - "extractors/threads/*.go" + - ".github/workflows/stream_threads.yml" + schedule: + # run ci weekly + - cron: "0 0 * * 0" + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + go: ["1.22"] + os: [ubuntu-latest] + name: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go }} + + - name: Test + run: go test -timeout 5m -race -coverpkg=./... 
-coverprofile=coverage.txt github.com/iawia002/lux/extractors/threads diff --git a/README.md b/README.md index 81a9bbf99..2a925e750 100644 --- a/README.md +++ b/README.md @@ -622,6 +622,7 @@ $ lux -j "https://www.bilibili.com/video/av20203945" | 秒拍 | | ✓ | | | | | [![miaopai](https://github.com/iawia002/lux/actions/workflows/stream_miaopai.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_miaopai.yml) | | 微博 | | ✓ | | | | | [![weibo](https://github.com/iawia002/lux/actions/workflows/stream_weibo.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_weibo.yml) | | Instagram | | ✓ | ✓ | | | | [![instagram](https://github.com/iawia002/lux/actions/workflows/stream_instagram.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_instagram.yml) | +| Threads | | ✓ | ✓ | | | | [![threads](https://github.com/iawia002/lux/actions/workflows/stream_threads.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_threads.yml) | | Twitter | | ✓ | | | | | [![twitter](https://github.com/iawia002/lux/actions/workflows/stream_twitter.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_twitter.yml) | | 腾讯视频 | | ✓ | | | | | [![qq](https://github.com/iawia002/lux/actions/workflows/stream_qq.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_qq.yml) | | 网易云音乐 | | ✓ | | | | | [![netease](https://github.com/iawia002/lux/actions/workflows/stream_netease.yml/badge.svg)](https://github.com/iawia002/lux/actions/workflows/stream_netease.yml) | diff --git a/app/register.go b/app/register.go index a7b0eb60b..dcce480b0 100644 --- a/app/register.go +++ b/app/register.go @@ -28,6 +28,7 @@ import ( _ "github.com/iawia002/lux/extractors/rumble" _ "github.com/iawia002/lux/extractors/streamtape" _ "github.com/iawia002/lux/extractors/tangdou" + _ "github.com/iawia002/lux/extractors/threads" _ "github.com/iawia002/lux/extractors/tiktok" _ "github.com/iawia002/lux/extractors/tumblr" _ 
"github.com/iawia002/lux/extractors/twitter" diff --git a/extractors/threads/threads.go b/extractors/threads/threads.go new file mode 100644 index 000000000..f2ef301b8 --- /dev/null +++ b/extractors/threads/threads.go @@ -0,0 +1,151 @@ +package threads + +import ( + "fmt" + "net" + "net/http" + netURL "net/url" + "strings" + "time" + + "github.com/gocolly/colly/v2" + "github.com/pkg/errors" + + "github.com/iawia002/lux/extractors" + "github.com/iawia002/lux/request" + "github.com/iawia002/lux/utils" +) + +func init() { + extractors.Register("threads", New()) +} + +type extractor struct { + client *http.Client +} + +// New returns a instagram extractor. +func New() extractors.Extractor { + return &extractor{ + client: &http.Client{ + Timeout: 10 * time.Second, + Transport: &http.Transport{ + Dial: (&net.Dialer{ + Timeout: 5 * time.Second, + }).Dial, + TLSHandshakeTimeout: 5 * time.Second, + }, + }, + } +} + +type media struct { + URL string + Type extractors.DataType +} + +// Extract is the main function to extract the data. 
+func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) { + URL, err := netURL.Parse(url) + if err != nil { + return nil, errors.WithStack(err) + } + + paths := strings.Split(URL.Path, "/") + if len(paths) < 4 { // expect /@user/post/<code>: paths[3] is read below + return nil, errors.New("invalid URL format") + } + + poster := paths[1] + shortCode := paths[3] + + medias := make([]media, 0) + + title := fmt.Sprintf("Threads %s - %s", poster, shortCode) + + collector := colly.NewCollector() + collector.SetClient(e.client) + + // posts with a single image or video + collector.OnHTML("div.SingleInnerMediaContainer", func(e *colly.HTMLElement) { + if src := e.ChildAttr("img", "src"); src != "" { + medias = append(medias, media{ + URL: src, + Type: extractors.DataTypeImage, + }) + } + if src := e.ChildAttr("video > source", "src"); src != "" { + medias = append(medias, media{ + URL: src, + Type: extractors.DataTypeVideo, + }) + } + }) + + // posts with multiple images or videos + collector.OnHTML("div.MediaScrollImageContainer", func(e *colly.HTMLElement) { + if src := e.ChildAttr("img", "src"); src != "" { + medias = append(medias, media{ + URL: src, + Type: extractors.DataTypeImage, + }) + } + if src := e.ChildAttr("video > source", "src"); src != "" { + medias = append(medias, media{ + URL: src, + Type: extractors.DataTypeVideo, + }) + } + }) + + // title with caption + // collector.OnHTML("span.BodyTextContainer", func(e *colly.HTMLElement) { + // title = e.Text + // }) + + if err := collector.Visit(URL.JoinPath("embed").String()); err != nil { + return nil, fmt.Errorf("failed to send HTTP request to the Threads: %w", errors.WithStack(err)) + } + + var totalSize int64 + var parts []*extractors.Part + + for _, m := range medias { + _, ext, err := utils.GetNameAndExt(m.URL) + if err != nil { + return nil, errors.WithStack(err) + } + fileSize, err := request.Size(m.URL, url) + if err != nil { + return nil, errors.WithStack(err) + } + + part := &extractors.Part{ + URL: m.URL, + Size: fileSize, 
Ext: ext, + } + parts = append(parts, part) + } + + for _, part := range parts { + totalSize += part.Size + } + + streams := map[string]*extractors.Stream{ + "default": { + Parts: parts, + Size: totalSize, + }, + } + + return []*extractors.Data{ + { + Site: "Threads www.threads.net", + Title: title, + Type: extractors.DataTypeImage, + Streams: streams, + URL: url, + }, + }, nil +} diff --git a/extractors/threads/threads_test.go b/extractors/threads/threads_test.go new file mode 100644 index 000000000..33ebae135 --- /dev/null +++ b/extractors/threads/threads_test.go @@ -0,0 +1,56 @@ +package threads_test + +import ( + "testing" + + "github.com/iawia002/lux/extractors" + "github.com/iawia002/lux/extractors/threads" + "github.com/iawia002/lux/test" +) + +func TestDownload(t *testing.T) { + tests := []struct { + name string + args test.Args + }{ + { + name: "video test", + args: test.Args{ + URL: "https://www.threads.net/@rowancheung/post/C9xPmHcpfiN", + Title: `Threads @rowancheung - C9xPmHcpfiN`, + Size: 5740684, + }, + }, + { + name: "video shared test", + args: test.Args{ + URL: "https://www.threads.net/@zuck/post/C9xRqbNPbx2", + Title: `Threads @zuck - C9xRqbNPbx2`, + Size: 5740684, + }, + }, + { + name: "image test", + args: test.Args{ + URL: "https://www.threads.net/@zuck/post/C-BoS7lM8sH", + Title: `Threads @zuck - C-BoS7lM8sH`, + Size: 159331, + }, + }, + { + name: "hybrid album test", + args: test.Args{ + URL: "https://www.threads.net/@meta/post/C95Z1DrPNhi", + Title: `Threads @meta - C95Z1DrPNhi`, + Size: 1131229, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data, err := threads.New().Extract(tt.args.URL, extractors.Options{}) + test.CheckError(t, err) + test.Check(t, tt.args, data[0]) + }) + } +}