Skip to content

textparser test #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v6
with:
version: v1.60
version: v1.64

build-and-push-image:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -51,7 +51,7 @@ jobs:
uses: docker/setup-buildx-action@v1

- name: Cache Docker layers
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
Expand Down
2 changes: 2 additions & 0 deletions commands/youtube/youtube.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build exclude

package youtube

import (
Expand Down
1 change: 1 addition & 0 deletions parsers.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
_ "github.com/Gasoid/regular-go-bot/parsers"
_ "github.com/Gasoid/regular-go-bot/parsers/instagram"
_ "github.com/Gasoid/regular-go-bot/parsers/location"
_ "github.com/Gasoid/regular-go-bot/parsers/speech"
)
248 changes: 248 additions & 0 deletions parsers/instagram/instagram.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
package instagram

import (
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/Gasoid/regular-go-bot/parsers"
)

// instagramParserName is the identifier reported by InstagramParser.Name.
const instagramParserName = "instagram"

// InstagramParser is a stateless text parser for Instagram links.
type InstagramParser struct{}

// Name returns the parser's identifier, "instagram".
func (p *InstagramParser) Name() string {
	return instagramParserName
}

// Handler scans text for Instagram post/reel links and processes each one in
// order of appearance.
//
// A failure on one link is logged and reported to the user via
// callback.ReplyMessage; it does not stop the remaining links from being
// processed. Handler therefore always returns nil.
func (p *InstagramParser) Handler(text string, callback parsers.Callback) error {
	// Ranging over an empty result is a no-op, so a message without any
	// Instagram link falls straight through to the nil return.
	for _, link := range extractInstagramURLs(text) {
		err := processInstagramURL(link, callback)
		if err == nil {
			continue
		}

		slog.Error("failed to process Instagram URL", "url", link, "error", err)
		callback.ReplyMessage(fmt.Sprintf("Failed to process Instagram content: %v", err))
	}

	return nil
}

// instagramURLRe matches http(s) links to an Instagram post (/p/<id>) or reel
// (/reel/<id>), with an optional "www." prefix and an optional trailing slash.
// It is compiled once at package initialisation instead of on every call.
var instagramURLRe = regexp.MustCompile(`https?://(?:www\.)?instagram\.com/(?:p|reel)/[A-Za-z0-9_-]+/?`)

// extractInstagramURLs returns every Instagram post or reel URL found in text,
// in order of appearance. Query strings and fragments are not part of the
// returned URLs. It returns nil when text contains no such URL.
func extractInstagramURLs(text string) []string {
	return instagramURLRe.FindAllString(text, -1)
}

// processInstagramURL resolves the media behind a single Instagram URL,
// downloads it to a temporary file and hands that file to the callback:
// SendVideo for videos, SendPhoto (with a fixed caption) for images.
//
// The temporary file is removed when the function returns. An error is
// returned when the media cannot be located or downloaded.
func processInstagramURL(url string, callback parsers.Callback) error {
	slog.Info("Processing Instagram URL", "url", url)

	// Instagram applies strict anti-scraping measures; extractMediaInfo is a
	// simplified, best-effort lookup and may fail for many posts.
	mediaInfo, err := extractMediaInfo(url)
	if err != nil {
		return fmt.Errorf("failed to extract media info: %w", err)
	}

	if !mediaInfo.IsVideo {
		photoPath, err := downloadMedia(mediaInfo.URL, "photo")
		if err != nil {
			return fmt.Errorf("failed to download photo: %w", err)
		}
		defer os.Remove(photoPath)

		callback.SendPhoto(photoPath, "📸 Instagram photo")
		return nil
	}

	videoPath, err := downloadMedia(mediaInfo.URL, "video")
	if err != nil {
		return fmt.Errorf("failed to download video: %w", err)
	}
	defer os.Remove(videoPath)

	callback.SendVideo(videoPath)
	return nil
}

// MediaInfo describes one media item found on an Instagram page.
type MediaInfo struct {
	URL     string // direct URL of the media file
	IsVideo bool   // true for a video, false for an image
	Caption string // post caption; extractMediaInfo leaves this empty
}

// maxInstagramPageBytes caps how much of a page body extractMediaInfo reads
// into memory, so an unexpectedly large response cannot exhaust it.
const maxInstagramPageBytes = 10 << 20 // 10 MiB

// instagramPageClient fetches Instagram pages. It is shared between calls so
// connections can be reused, and it carries a timeout so a stalled server
// cannot block the caller forever.
var instagramPageClient = &http.Client{Timeout: 30 * time.Second}

// Patterns used to locate media URLs inside the page source. They are compiled
// once at package initialisation and tried in the order listed.
var (
	instagramVideoRes = []*regexp.Regexp{
		regexp.MustCompile(`"video_url":"([^"]+)"`),
		regexp.MustCompile(`"src":"([^"]+\.mp4[^"]*)"`),
		regexp.MustCompile(`videoUrl":"([^"]+)"`),
	}
	instagramImageRes = []*regexp.Regexp{
		regexp.MustCompile(`"display_url":"([^"]+)"`),
		regexp.MustCompile(`"src":"([^"]+\.jpg[^"]*)"`),
		regexp.MustCompile(`"src":"([^"]+\.jpeg[^"]*)"`),
		regexp.MustCompile(`"thumbnail_src":"([^"]+)"`),
	}
	// instagramFallbackRe matches any https URL that mentions a known media
	// extension; it is only consulted when the JSON-style patterns find nothing.
	instagramFallbackRe = regexp.MustCompile(`https://[^"]*\.(?:jpg|jpeg|png|mp4)[^"]*`)

	// instagramURLUnescaper undoes the JSON escaping of "&" and "/" that the
	// embedded URLs carry.
	instagramURLUnescaper = strings.NewReplacer(`\u0026`, "&", `\/`, "/")
)

// firstInstagramSubmatch returns the unescaped first capture group of the
// first pattern in res that matches page, and whether any pattern matched.
func firstInstagramSubmatch(res []*regexp.Regexp, page string) (string, bool) {
	for _, re := range res {
		if m := re.FindStringSubmatch(page); len(m) > 1 {
			return instagramURLUnescaper.Replace(m[1]), true
		}
	}
	return "", false
}

// extractMediaInfo fetches the page at instagramURL and searches its source
// for a direct media URL. Video patterns are tried first, then image patterns,
// then a generic fallback that accepts any https URL containing "instagram"
// and a known media extension.
//
// Instagram has strict anti-scraping measures, so this is a simplified,
// best-effort approach. For production use consider the Instagram Basic
// Display API, a third-party scraping service, or browser automation.
//
// It returns an error when the request cannot be built or sent, when the
// response status is not 200, when the body cannot be read, or when no media
// URL is found.
func extractMediaInfo(instagramURL string) (*MediaInfo, error) {
	req, err := http.NewRequest(http.MethodGet, instagramURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	// Set headers to mimic a real browser.
	//
	// Accept-Encoding is intentionally NOT set: when the caller sets it, the
	// transport stops decompressing the response transparently and the
	// patterns below would be run against gzip/brotli bytes. Leaving it unset
	// lets the transport negotiate gzip and decode it for us.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")
	req.Header.Set("DNT", "1")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Upgrade-Insecure-Requests", "1")

	resp, err := instagramPageClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch Instagram page: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("instagram returned status %d", resp.StatusCode)
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxInstagramPageBytes))
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}
	page := string(body)

	// Prefer a video when the page advertises one.
	if videoURL, ok := firstInstagramSubmatch(instagramVideoRes, page); ok {
		return &MediaInfo{URL: videoURL, IsVideo: true}, nil
	}
	if imageURL, ok := firstInstagramSubmatch(instagramImageRes, page); ok {
		return &MediaInfo{URL: imageURL, IsVideo: false}, nil
	}

	// Fallback: take the first Instagram-hosted URL that mentions a media
	// extension. Image extensions are checked before mp4.
	for _, candidate := range instagramFallbackRe.FindAllString(page, -1) {
		if !strings.Contains(candidate, "instagram") {
			continue
		}
		switch {
		case strings.Contains(candidate, "jpg"),
			strings.Contains(candidate, "jpeg"),
			strings.Contains(candidate, "png"):
			return &MediaInfo{URL: instagramURLUnescaper.Replace(candidate), IsVideo: false}, nil
		case strings.Contains(candidate, "mp4"):
			return &MediaInfo{URL: instagramURLUnescaper.Replace(candidate), IsVideo: true}, nil
		}
	}

	return nil, fmt.Errorf("no media found in Instagram post - Instagram may have blocked access")
}

// instagramMediaClient downloads media files. It is shared between calls so
// connections can be reused; the timeout bounds the whole transfer so a
// stalled download cannot block the caller forever.
var instagramMediaClient = &http.Client{Timeout: 2 * time.Minute}

// instagramMediaExt picks a file extension for downloaded media. The response
// Content-Type and the URL are consulted first (mp4, then jpeg, then png);
// when neither is conclusive the requested mediaType decides, defaulting to
// ".jpg".
func instagramMediaExt(contentType, url, mediaType string) string {
	switch {
	case strings.Contains(contentType, "video/mp4") || strings.Contains(url, ".mp4"):
		return ".mp4"
	case strings.Contains(contentType, "image/jpeg") || strings.Contains(url, ".jpg") || strings.Contains(url, ".jpeg"):
		return ".jpg"
	case strings.Contains(contentType, "image/png") || strings.Contains(url, ".png"):
		return ".png"
	case mediaType == "video":
		return ".mp4"
	default:
		return ".jpg"
	}
}

// downloadMedia fetches url and stores the response body in a new temporary
// file named "instagram_<mediaType>_*<ext>". It returns the path of that file;
// the caller is responsible for removing it.
//
// mediaType is used in the file name and as a hint for the extension (pass
// "video" for videos; anything else is treated as an image).
//
// An error is returned when the request cannot be built or sent, when the
// response status is not 200, or when the file cannot be created or fully
// written. On a write failure the partial temporary file is removed.
func downloadMedia(url, mediaType string) (string, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}

	// Set headers to avoid being blocked.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "*/*")
	req.Header.Set("Referer", "https://www.instagram.com/")

	resp, err := instagramMediaClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("failed to download media: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("download failed with status %d", resp.StatusCode)
	}

	ext := instagramMediaExt(resp.Header.Get("Content-Type"), url, mediaType)

	tmpFile, err := os.CreateTemp("", fmt.Sprintf("instagram_%s_*%s", mediaType, ext))
	if err != nil {
		return "", fmt.Errorf("failed to create temp file: %w", err)
	}

	// Close explicitly rather than via defer: a failed Close can mean the data
	// never reached the disk, which must be reported like a failed write.
	size, err := io.Copy(tmpFile, resp.Body)
	if closeErr := tmpFile.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best-effort cleanup of the partial file; the write error is the one
		// worth reporting, so a failure to remove is deliberately ignored.
		_ = os.Remove(tmpFile.Name())
		return "", fmt.Errorf("failed to save media: %w", err)
	}

	slog.Info("Downloaded Instagram media", "file", tmpFile.Name(), "type", mediaType, "size", size)
	return tmpFile.Name(), nil
}

func init() {
parsers.RegisterTextParser(&InstagramParser{})
}
10 changes: 10 additions & 0 deletions parsers/parsers.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ type Parser interface {
// Package-level parser registries.
var (
voiceParsers = []Parser{}
// locationParsers is the list returned by ListLocationParsers.
locationParsers = []Parser{}
// textParsers is appended to by RegisterTextParser and returned by ListTextParsers.
textParsers = []Parser{}
)

type Wrapper struct {
Expand Down Expand Up @@ -48,8 +49,17 @@ func ListLocationParsers() []Parser {
return locationParsers
}

// RegisterTextParser wraps parser in a Wrapper and appends it to the
// package-level list returned by ListTextParsers.
func RegisterTextParser(parser Parser) {
	wrapped := &Wrapper{parser}
	textParsers = append(textParsers, wrapped)
}

// ListTextParsers returns the registered text parsers in registration order.
// The returned slice is the package's own list, not a copy.
func ListTextParsers() []Parser {
	registered := textParsers
	return registered
}

// Callback bundles the functions a parser uses to respond to the message it
// was given. The fields are plain func values supplied by whoever invokes
// Parser.Handler; calling a field that was left nil panics.
type Callback struct {
// SendMessage sends text as a new message.
SendMessage func(text string)
// SendVideo sends the video stored at filePath.
SendVideo func(filePath string)
// SendPhoto sends the image stored at filePath together with caption.
SendPhoto func(filePath, caption string)
// ReplyMessage sends text as a reply to the message being handled.
ReplyMessage func(text string)
}
58 changes: 58 additions & 0 deletions telegram/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,64 @@ func commandHandler(c commands.Command) func(ctx context.Context, b *bot.Bot, up
}

func defaultHandler(ctx context.Context, b *bot.Bot, update *models.Update) {
// Handle text messages (including Instagram URLs)
if update.Message.Text != "" {
for _, p := range parsers.ListTextParsers() {
err := p.Handler(update.Message.Text, parsers.Callback{
SendMessage: func(text string) {
b.SendMessage(ctx, &bot.SendMessageParams{
ChatID: update.Message.Chat.ID,
Text: text,
})
},
SendVideo: func(filePath string) {
f, err := os.Open(filePath)
if err != nil {
slog.Error("file not found", "err", err)
return
}
defer f.Close()

b.SendVideo(ctx, &bot.SendVideoParams{
ChatID: update.Message.Chat.ID,
Video: &models.InputFileUpload{
Data: f,
Filename: "video",
},
})
},
SendPhoto: func(filePath, caption string) {
fileData, err := os.ReadFile(filePath)
if err != nil {
slog.Error("file not found", "err", err)
return
}

params := &bot.SendPhotoParams{
ChatID: update.Message.Chat.ID,
Photo: &models.InputFileUpload{Filename: "image.jpg", Data: bytes.NewReader(fileData)},
Caption: caption,
}

b.SendPhoto(ctx, params)
},
ReplyMessage: func(text string) {
b.SendMessage(ctx, &bot.SendMessageParams{
ChatID: update.Message.Chat.ID,
Text: text,
ReplyParameters: &models.ReplyParameters{
MessageID: update.Message.ID,
ChatID: update.Message.Chat.ID,
},
})
},
})
if err != nil {
slog.Error("p.Handler", "err", err)
}
}
}

if update.Message.Location != nil {
for _, p := range parsers.ListLocationParsers() {
err := p.Handler(fmt.Sprintf("%f,%f", update.Message.Location.Latitude, update.Message.Location.Longitude), parsers.Callback{
Expand Down
Loading