Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions internal/extractors/facebook/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package facebook

import (
"fmt"
"regexp"

"github.com/govdbot/govd/internal/database"
"github.com/govdbot/govd/internal/models"
"github.com/govdbot/govd/internal/networking"
)

// facebookHost is the host list assigned to the Host field of both
// ShareExtractor and Extractor.
var facebookHost = []string{"facebook"}

// ShareExtractor matches facebook.com/share/{r,v,p}/<id> links. It is flagged
// as a redirect extractor: its GetFunc does not build media, it only returns
// the URL reported by ctx.FetchLocation for the share link (presumably the
// redirect target, which is then handled by another extractor).
var ShareExtractor = &models.Extractor{
	ID:          "facebook",
	DisplayName: "Facebook (Share)",

	URLPattern: regexp.MustCompile(`https?://(?:(?:www|m)\.)?facebook\.com/share/(?:r|v|p)/(?P<id>[a-zA-Z0-9]+)`),
	Host:       facebookHost,

	Redirect: true,

	GetFunc: func(ctx *models.ExtractorContext) (*models.ExtractorResponse, error) {
		// Send the same browser-like headers used for regular page fetches.
		params := &networking.RequestParams{Headers: webHeaders}

		target, err := ctx.FetchLocation(ctx.ContentURL, params)
		if err != nil {
			return nil, fmt.Errorf("failed to follow share redirect: %w", err)
		}

		redirect := &models.ExtractorResponse{URL: target}
		return redirect, nil
	},
}

// Extractor handles direct Facebook video URLs on the www, m and mbasic
// hosts: /watch?v=<id>, /reel/<id>, /video(s)/<id>, /post(s)/<id> and the
// <page>/videos|posts|reel(s)/<id> forms. The media itself is produced by
// GetMedia.
var Extractor = &models.Extractor{
	ID:          "facebook",
	DisplayName: "Facebook",

	URLPattern: regexp.MustCompile(
		`https?://(?:(?:www|m|mbasic)\.)?facebook\.com/` +
			`(?:watch/?\?(?:[^&]*&)*v=|(?:reel|videos?|posts?)/|[^/]+/(?:videos|posts|reels?)/)` +
			`(?P<id>[a-zA-Z0-9]+)`,
	),
	Host: facebookHost,

	GetFunc: func(ctx *models.ExtractorContext) (*models.ExtractorResponse, error) {
		extracted, err := GetMedia(ctx)
		if err != nil {
			// GetMedia already adds context to its errors; pass them through.
			return nil, err
		}

		result := &models.ExtractorResponse{Media: extracted}
		return result, nil
	},
}

// GetMedia fetches the video data for the URL described by ctx and converts
// it into a media object.
//
// It returns an error when the HTTP client carries no cookies, when the
// video data cannot be retrieved, or when no usable format can be built.
func GetMedia(ctx *models.ExtractorContext) (*models.Media, error) {
	// Refuse to scrape without authentication cookies.
	if ctx.HTTPClient.Cookies == nil {
		return nil, fmt.Errorf("auth cookies are required for facebook")
	}

	data, fetchErr := GetVideoData(ctx)
	if fetchErr != nil {
		return nil, fmt.Errorf("failed to get video data: %w", fetchErr)
	}

	return buildMedia(ctx, data)
}

// buildMedia turns extracted video data into a media object with a single
// item. The title, when present, becomes the caption. An "hd" format is added
// when data.HDURL is set (carrying data.Width and data.Height) and an "sd"
// format when data.SDURL is set; both are declared as AVC video with AAC
// audio. An error is returned if neither URL is available.
func buildMedia(ctx *models.ExtractorContext, data *VideoData) (*models.Media, error) {
	result := ctx.NewMedia()
	if title := data.Title; title != "" {
		result.SetCaption(title)
	}

	entry := result.NewItem()

	var variants []*models.MediaFormat

	if hd := data.HDURL; hd != "" {
		hdFormat := &models.MediaFormat{
			FormatID:   "hd",
			Type:       database.MediaTypeVideo,
			VideoCodec: database.MediaCodecAvc,
			AudioCodec: database.MediaCodecAac,
			URL:        []string{hd},
			Width:      data.Width,
			Height:     data.Height,
		}
		variants = append(variants, hdFormat)
	}

	if sd := data.SDURL; sd != "" {
		// No dimensions are attached to the SD variant.
		sdFormat := &models.MediaFormat{
			FormatID:   "sd",
			Type:       database.MediaTypeVideo,
			VideoCodec: database.MediaCodecAvc,
			AudioCodec: database.MediaCodecAac,
			URL:        []string{sd},
		}
		variants = append(variants, sdFormat)
	}

	if len(variants) == 0 {
		return nil, fmt.Errorf("no video formats found")
	}

	entry.AddFormats(variants...)
	return result, nil
}
10 changes: 10 additions & 0 deletions internal/extractors/facebook/models.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package facebook

// VideoData holds extracted video information from Facebook page HTML.
type VideoData struct {
	// HDURL and SDURL are the progressive download URLs for the HD and SD
	// renditions; either may be empty when that quality was not found.
	HDURL, SDURL string

	// Title is the video title, used as the media caption when non-empty.
	Title string

	// Width and Height are the dimensions attached to the HD format.
	Width, Height int32
}
173 changes: 173 additions & 0 deletions internal/extractors/facebook/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package facebook

import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"unicode/utf8"

"github.com/govdbot/govd/internal/logger"
"github.com/govdbot/govd/internal/models"
"github.com/govdbot/govd/internal/networking"
)

// webHeaders are the request headers sent with the share-link redirect
// lookup and with the video page fetch. They imitate a top-level document
// navigation from desktop Chrome on Windows.
var webHeaders = map[string]string{
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
}

var (
	// hdURLPattern and sdURLPattern capture the (still JSON-escaped)
	// progressive_url of the entry whose metadata quality is "HD" / "SD".
	hdURLPattern = progressiveURLPattern("HD")
	sdURLPattern = progressiveURLPattern("SD")

	// titlePattern captures the (still JSON-escaped) text of the first
	// "title":{"text":"..."} object.
	titlePattern = regexp.MustCompile(
		`"title"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"`,
	)
)

// progressiveURLPattern compiles the expression that matches a
// "progressive_url" string immediately followed by a "failure_reason" value
// and a "metadata" object whose "quality" equals the given label. The URL is
// capture group 1; backslash escapes inside the string are tolerated.
func progressiveURLPattern(quality string) *regexp.Regexp {
	return regexp.MustCompile(
		`"progressive_url"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"` +
			`\s*,\s*"failure_reason"\s*:\s*[^,]+` +
			`\s*,\s*"metadata"\s*:\s*\{\s*"quality"\s*:\s*"` + quality + `"\s*\}`,
	)
}

// GetVideoData downloads the Facebook page for the content described by ctx
// and extracts the video URLs and title from its body.
//
// The m. and mbasic. hosts are rewritten to www. before fetching, and watch
// URLs are replaced by the reel permalink built from ctx.ContentID. A
// non-200 response, a transport error, or a body without video URLs results
// in an error.
func GetVideoData(ctx *models.ExtractorContext) (*VideoData, error) {
	pageURL := ctx.ContentURL
	for _, mobileHost := range [...]string{"m.facebook.com", "mbasic.facebook.com"} {
		pageURL = strings.Replace(pageURL, mobileHost, "www.facebook.com", 1)
	}

	// convert watch URLs to reel permalink,
	// /watch/?v=XXX pages return wrong video data when scraped
	if ctx.ContentID != "" && strings.Contains(pageURL, "/watch") {
		pageURL = "https://www.facebook.com/reel/" + ctx.ContentID
	}

	params := &networking.RequestParams{Headers: webHeaders}
	resp, err := ctx.Fetch(http.MethodGet, pageURL, params)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	logger.WriteFile("fb_response", resp)

	if status := resp.StatusCode; status != http.StatusOK {
		return nil, fmt.Errorf("failed to get page: %s", resp.Status)
	}

	page, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return parseVideoFromBody(page, ctx.ContentID)
}

// parseVideoFromBody extracts the HD/SD progressive URLs and the title from
// a Facebook page body.
//
// URLs are searched inside the section belonging to videoID when one can be
// located, otherwise in the whole body (reel/post pages with a single
// video). The title is always searched in the whole body. An error is
// returned when neither an HD nor an SD URL is found.
func parseVideoFromBody(body []byte, videoID string) (*VideoData, error) {
	scope := findVideoSection(body, videoID)
	if scope == nil {
		scope = body
	}

	var hdURL, sdURL string
	if m := hdURLPattern.FindSubmatch(scope); len(m) >= 2 {
		hdURL = unescapeFacebookURL(string(m[1]))
	}
	if m := sdURLPattern.FindSubmatch(scope); len(m) >= 2 {
		sdURL = unescapeFacebookURL(string(m[1]))
	}

	if hdURL == "" && sdURL == "" {
		return nil, fmt.Errorf("no video URLs found in page")
	}

	result := &VideoData{HDURL: hdURL, SDURL: sdURL}

	// title can be anywhere in the page
	if m := titlePattern.FindSubmatch(body); len(m) >= 2 {
		result.Title = unescapeUnicode(string(m[1]))
	}

	return result, nil
}

// findVideoSection narrows body down to the delivery data of one video.
//
// The returned slice starts at the first "dash_mpd_debug.mpd?v=<videoID>"
// occurrence and ends right after the next `"id":"<videoID>"` marker. When
// that closing marker is missing, at most 20000 bytes from the anchor are
// returned. The result is nil when videoID is empty or the anchor is absent.
func findVideoSection(body []byte, videoID string) []byte {
	// size of the window used when the closing marker cannot be found
	const fallbackWindow = 20000

	if videoID == "" {
		return nil
	}

	anchorAt := bytes.Index(body, []byte("dash_mpd_debug.mpd?v="+videoID))
	if anchorAt < 0 {
		return nil
	}
	tail := body[anchorAt:]

	// "id":"VIDEO_ID" closes the videoDeliveryResponseResult block
	closing := []byte(`"id":"` + videoID + `"`)
	if pos := bytes.Index(tail, closing); pos > 0 {
		return tail[:pos+len(closing)]
	}

	if len(tail) < fallbackWindow {
		return tail
	}
	return tail[:fallbackWindow]
}

// unescapeFacebookURL converts a URL captured from page JSON into a usable
// URL: escaped slashes (\/) become plain slashes first, then \uXXXX escapes
// are decoded.
func unescapeFacebookURL(s string) string {
	slashesFixed := strings.ReplaceAll(s, `\/`, "/")
	return unescapeUnicode(slashesFixed)
}

// unescapeUnicode decodes JSON-style \uXXXX escapes in s and returns the
// result; all other bytes are copied through unchanged.
//
// A high-surrogate escape immediately followed by a low-surrogate escape
// (e.g. \ud83d\ude00) is decoded as one code point, which is how JSON encodes
// characters outside the Basic Multilingual Plane such as emoji. Escapes
// with non-hex digits, truncated escapes and unpaired surrogates are left
// verbatim.
func unescapeUnicode(s string) string {
	var b strings.Builder
	b.Grow(len(s))

	for i := 0; i < len(s); {
		r, ok := parseUnicodeEscape(s, i)
		if !ok {
			b.WriteByte(s[i])
			i++
			continue
		}

		// A high surrogate is only meaningful together with the low
		// surrogate escape that directly follows it.
		if r >= 0xD800 && r <= 0xDBFF {
			if lo, ok := parseUnicodeEscape(s, i+6); ok && lo >= 0xDC00 && lo <= 0xDFFF {
				b.WriteRune(0x10000 + ((r - 0xD800) << 10) + (lo - 0xDC00))
				i += 12
				continue
			}
		}

		if utf8.ValidRune(r) {
			b.WriteRune(r)
			i += 6
			continue
		}

		// Unpaired surrogate: keep the escape text as it is.
		b.WriteByte(s[i])
		i++
	}
	return b.String()
}

// parseUnicodeEscape reports the value of the \uXXXX escape that starts at
// s[i]. ok is false when fewer than six bytes remain, the prefix is not \u,
// or any of the four digits is not hexadecimal.
func parseUnicodeEscape(s string, i int) (r rune, ok bool) {
	if i+6 > len(s) || s[i] != '\\' || s[i+1] != 'u' {
		return 0, false
	}
	for j := i + 2; j < i+6; j++ {
		c := s[j]
		r <<= 4
		switch {
		case c >= '0' && c <= '9':
			r |= rune(c - '0')
		case c >= 'a' && c <= 'f':
			r |= rune(c-'a') + 10
		case c >= 'A' && c <= 'F':
			r |= rune(c-'A') + 10
		default:
			return 0, false
		}
	}
	return r, true
}
3 changes: 3 additions & 0 deletions internal/extractors/main.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package extractors

import (
"github.com/govdbot/govd/internal/extractors/facebook"
"github.com/govdbot/govd/internal/extractors/instagram"
"github.com/govdbot/govd/internal/extractors/ninegag"
"github.com/govdbot/govd/internal/extractors/pinterest"
Expand All @@ -14,6 +15,8 @@ import (
)

var Extractors = []*models.Extractor{
facebook.ShareExtractor,
facebook.Extractor,
tiktok.Extractor,
tiktok.VMExtractor,
soundcloud.Extractor,
Expand Down
Loading