2024-08-16 22:29:21 +00:00
|
|
|
package youtube
|
|
|
|
|
|
|
|
import (
|
2024-08-22 22:15:44 +00:00
|
|
|
"context"
|
|
|
|
"encoding/json"
|
|
|
|
"flag"
|
|
|
|
"fmt"
|
|
|
|
"github.com/anaskhan96/soup"
|
2024-08-16 22:29:21 +00:00
|
|
|
"github.com/danielmiessler/fabric/common"
|
2024-08-22 22:15:44 +00:00
|
|
|
"google.golang.org/api/option"
|
|
|
|
"google.golang.org/api/youtube/v3"
|
|
|
|
"log"
|
|
|
|
"regexp"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
2024-08-16 22:29:21 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
func NewYouTube() (ret *YouTube) {
|
|
|
|
|
|
|
|
label := "YouTube"
|
|
|
|
ret = &YouTube{}
|
|
|
|
|
|
|
|
ret.Configurable = &common.Configurable{
|
|
|
|
Label: label,
|
|
|
|
EnvNamePrefix: common.BuildEnvVariablePrefix(label),
|
|
|
|
}
|
|
|
|
|
2024-08-17 16:43:56 +00:00
|
|
|
ret.ApiKey = ret.AddSetupQuestion("API key", true)
|
2024-08-16 22:29:21 +00:00
|
|
|
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
type YouTube struct {
|
|
|
|
*common.Configurable
|
2024-08-17 14:17:56 +00:00
|
|
|
ApiKey *common.SetupQuestion
|
2024-08-22 22:15:44 +00:00
|
|
|
|
|
|
|
service *youtube.Service
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) initService() (err error) {
|
|
|
|
if o.service == nil {
|
|
|
|
ctx := context.Background()
|
|
|
|
o.service, err = youtube.NewService(ctx, option.WithAPIKey(o.ApiKey.Value))
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GetVideoId(url string) (ret string, err error) {
|
|
|
|
if err = o.initService(); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
pattern := `(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})`
|
|
|
|
re := regexp.MustCompile(pattern)
|
|
|
|
match := re.FindStringSubmatch(url)
|
|
|
|
if len(match) > 1 {
|
|
|
|
ret = match[1]
|
|
|
|
} else {
|
|
|
|
err = fmt.Errorf("invalid YouTube URL, can't get video ID")
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabTranscriptForUrl(url string) (ret string, err error) {
|
|
|
|
var videoId string
|
|
|
|
if videoId, err = o.GetVideoId(url); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
return o.GrabTranscript(videoId)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabTranscript(videoId string) (ret string, err error) {
|
|
|
|
var transcript string
|
|
|
|
if transcript, err = o.GrabTranscriptBase(videoId); err != nil {
|
|
|
|
err = fmt.Errorf("transcript not available. (%v)", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parse the XML transcript
|
|
|
|
doc := soup.HTMLParse(transcript)
|
|
|
|
// Extract the text content from the <text> tags
|
|
|
|
textTags := doc.FindAll("text")
|
|
|
|
var textBuilder strings.Builder
|
|
|
|
for _, textTag := range textTags {
|
|
|
|
textBuilder.WriteString(textTag.Text())
|
|
|
|
textBuilder.WriteString(" ")
|
|
|
|
ret = textBuilder.String()
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabTranscriptBase(videoId string) (ret string, err error) {
|
|
|
|
if err = o.initService(); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
url := "https://www.youtube.com/watch?v=" + videoId
|
|
|
|
var resp string
|
|
|
|
if resp, err = soup.Get(url); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
doc := soup.HTMLParse(resp)
|
|
|
|
scriptTags := doc.FindAll("script")
|
|
|
|
for _, scriptTag := range scriptTags {
|
|
|
|
if strings.Contains(scriptTag.Text(), "captionTracks") {
|
|
|
|
regex := regexp.MustCompile(`"captionTracks":(\[.*?\])`)
|
|
|
|
match := regex.FindStringSubmatch(scriptTag.Text())
|
|
|
|
if len(match) > 1 {
|
|
|
|
var captionTracks []struct {
|
|
|
|
BaseURL string `json:"baseUrl"`
|
|
|
|
}
|
|
|
|
|
|
|
|
if err = json.Unmarshal([]byte(match[1]), &captionTracks); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(captionTracks) > 0 {
|
|
|
|
transcriptURL := captionTracks[0].BaseURL
|
|
|
|
ret, err = soup.Get(transcriptURL)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
err = fmt.Errorf("transcript not found")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabComments(videoId string) (ret []string, err error) {
|
|
|
|
if err = o.initService(); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
call := o.service.CommentThreads.List([]string{"snippet", "replies"}).VideoId(videoId).TextFormat("plainText").MaxResults(100)
|
|
|
|
var response *youtube.CommentThreadListResponse
|
|
|
|
if response, err = call.Do(); err != nil {
|
|
|
|
log.Printf("Failed to fetch comments: %v", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, item := range response.Items {
|
|
|
|
topLevelComment := item.Snippet.TopLevelComment.Snippet.TextDisplay
|
|
|
|
ret = append(ret, topLevelComment)
|
|
|
|
|
|
|
|
if item.Replies != nil {
|
|
|
|
for _, reply := range item.Replies.Comments {
|
|
|
|
replyText := reply.Snippet.TextDisplay
|
|
|
|
ret = append(ret, " - "+replyText)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabDurationForUrl(url string) (ret int, err error) {
|
|
|
|
if err = o.initService(); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
var videoId string
|
|
|
|
if videoId, err = o.GetVideoId(url); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
return o.GrabDuration(videoId)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabDuration(videoId string) (ret int, err error) {
|
|
|
|
var videoResponse *youtube.VideoListResponse
|
|
|
|
if videoResponse, err = o.service.Videos.List([]string{"contentDetails"}).Id(videoId).Do(); err != nil {
|
|
|
|
err = fmt.Errorf("error getting video details: %v", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
durationStr := videoResponse.Items[0].ContentDetails.Duration
|
|
|
|
|
|
|
|
matches := regexp.MustCompile(`(?i)PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?`).FindStringSubmatch(durationStr)
|
|
|
|
if len(matches) == 0 {
|
|
|
|
return 0, fmt.Errorf("invalid duration string: %s", durationStr)
|
|
|
|
}
|
|
|
|
|
|
|
|
hours, _ := strconv.Atoi(matches[1])
|
|
|
|
minutes, _ := strconv.Atoi(matches[2])
|
|
|
|
seconds, _ := strconv.Atoi(matches[3])
|
|
|
|
|
|
|
|
ret = hours*60 + minutes + seconds/60
|
|
|
|
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (o *YouTube) Grab(url string, options *Options) (ret *VideoInfo, err error) {
|
|
|
|
var videoId string
|
|
|
|
if videoId, err = o.GetVideoId(url); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = &VideoInfo{}
|
|
|
|
|
|
|
|
if options.Duration {
|
|
|
|
if ret.Duration, err = o.GrabDuration(videoId); err != nil {
|
|
|
|
err = fmt.Errorf("error parsing video duration: %v", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if options.Comments {
|
|
|
|
if ret.Comments, err = o.GrabComments(videoId); err != nil {
|
|
|
|
err = fmt.Errorf("error getting comments: %v", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if options.Transcript {
|
|
|
|
if ret.Transcript, err = o.GrabTranscript(videoId); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Options selects which pieces of information Grab collects for a video.
type Options struct {
	// Duration requests the video length.
	Duration bool

	// Transcript requests the video transcript.
	Transcript bool

	// Comments requests the video comments.
	Comments bool

	// Lang is the transcript language chosen on the command line.
	// NOTE(review): no code visible in this file reads it — confirm whether
	// it is consumed elsewhere.
	Lang string
}
|
|
|
|
|
|
|
|
// VideoInfo is the result of Grab. Only the fields whose option was requested
// are filled in; the rest keep their zero value.
type VideoInfo struct {
	// Transcript is the transcript text, serialised as "transcript".
	Transcript string `json:"transcript"`

	// Duration is the video length as returned by GrabDuration, serialised as
	// "duration".
	Duration int `json:"duration"`

	// Comments holds the comment lines returned by GrabComments, serialised
	// as "comments".
	Comments []string `json:"comments"`
}
|
|
|
|
|
|
|
|
func (o *YouTube) GrabByFlags() (ret *VideoInfo, err error) {
|
|
|
|
options := &Options{}
|
|
|
|
flag.BoolVar(&options.Duration, "duration", false, "Output only the duration")
|
|
|
|
flag.BoolVar(&options.Transcript, "transcript", false, "Output only the transcript")
|
|
|
|
flag.BoolVar(&options.Comments, "comments", false, "Output the comments on the video")
|
|
|
|
flag.StringVar(&options.Lang, "lang", "en", "Language for the transcript (default: English)")
|
|
|
|
flag.Parse()
|
|
|
|
|
|
|
|
if flag.NArg() == 0 {
|
|
|
|
log.Fatal("Error: No URL provided.")
|
|
|
|
}
|
|
|
|
|
|
|
|
url := flag.Arg(0)
|
|
|
|
ret, err = o.Grab(url, options)
|
|
|
|
return
|
2024-08-16 22:29:21 +00:00
|
|
|
}
|