package youtube import ( "context" "encoding/json" "flag" "fmt" "github.com/anaskhan96/soup" "github.com/danielmiessler/fabric/common" "google.golang.org/api/option" "google.golang.org/api/youtube/v3" "log" "regexp" "strconv" "strings" ) func NewYouTube() (ret *YouTube) { label := "YouTube" ret = &YouTube{} ret.Configurable = &common.Configurable{ Label: label, EnvNamePrefix: common.BuildEnvVariablePrefix(label), } ret.ApiKey = ret.AddSetupQuestion("API key", true) return } type YouTube struct { *common.Configurable ApiKey *common.SetupQuestion service *youtube.Service } func (o *YouTube) initService() (err error) { if o.service == nil { ctx := context.Background() o.service, err = youtube.NewService(ctx, option.WithAPIKey(o.ApiKey.Value)) } return } func (o *YouTube) GetVideoId(url string) (ret string, err error) { if err = o.initService(); err != nil { return } pattern := `(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})` re := regexp.MustCompile(pattern) match := re.FindStringSubmatch(url) if len(match) > 1 { ret = match[1] } else { err = fmt.Errorf("invalid YouTube URL, can't get video ID") } return } func (o *YouTube) GrabTranscriptForUrl(url string) (ret string, err error) { var videoId string if videoId, err = o.GetVideoId(url); err != nil { return } return o.GrabTranscript(videoId) } func (o *YouTube) GrabTranscript(videoId string) (ret string, err error) { var transcript string if transcript, err = o.GrabTranscriptBase(videoId); err != nil { err = fmt.Errorf("transcript not available. (%v)", err) return } // Parse the XML transcript doc := soup.HTMLParse(transcript) // Extract the text content from the tags textTags := doc.FindAll("text") var textBuilder strings.Builder for _, textTag := range textTags { textBuilder.WriteString(textTag.Text()) textBuilder.WriteString(" ") ret = textBuilder.String() } return } func (o *YouTube) GrabTranscriptBase(videoId string) (ret string, err error) { if err = o.initService(); err != nil { return } url := "https://www.youtube.com/watch?v=" + videoId var resp string if resp, err = soup.Get(url); err != nil { return } doc := soup.HTMLParse(resp) scriptTags := doc.FindAll("script") for _, scriptTag := range scriptTags { if strings.Contains(scriptTag.Text(), "captionTracks") { regex := regexp.MustCompile(`"captionTracks":(\[.*?\])`) match := regex.FindStringSubmatch(scriptTag.Text()) if len(match) > 1 { var captionTracks []struct { BaseURL string `json:"baseUrl"` } if err = json.Unmarshal([]byte(match[1]), &captionTracks); err != nil { return } if len(captionTracks) > 0 { transcriptURL := captionTracks[0].BaseURL ret, err = soup.Get(transcriptURL) return } } } } err = fmt.Errorf("transcript not found") return } func (o *YouTube) GrabComments(videoId string) (ret []string, err error) { if err = o.initService(); err != nil { return } call := o.service.CommentThreads.List([]string{"snippet", "replies"}).VideoId(videoId).TextFormat("plainText").MaxResults(100) var response *youtube.CommentThreadListResponse if response, err = call.Do(); err != nil { log.Printf("Failed to fetch comments: %v", err) return } for _, item := range response.Items { topLevelComment := item.Snippet.TopLevelComment.Snippet.TextDisplay ret = append(ret, topLevelComment) if item.Replies != nil { for _, reply := range item.Replies.Comments { replyText := reply.Snippet.TextDisplay ret = append(ret, " - "+replyText) } } } return } func (o *YouTube) GrabDurationForUrl(url string) (ret int, err error) { if err = o.initService(); err != nil { return } var videoId string if videoId, err = o.GetVideoId(url); err != nil { return } return o.GrabDuration(videoId) } func (o *YouTube) GrabDuration(videoId string) (ret int, err error) { var videoResponse *youtube.VideoListResponse if videoResponse, err = o.service.Videos.List([]string{"contentDetails"}).Id(videoId).Do(); err != nil { err = fmt.Errorf("error getting video details: %v", err) return } durationStr := videoResponse.Items[0].ContentDetails.Duration matches := regexp.MustCompile(`(?i)PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?`).FindStringSubmatch(durationStr) if len(matches) == 0 { return 0, fmt.Errorf("invalid duration string: %s", durationStr) } hours, _ := strconv.Atoi(matches[1]) minutes, _ := strconv.Atoi(matches[2]) seconds, _ := strconv.Atoi(matches[3]) ret = hours*60 + minutes + seconds/60 return } func (o *YouTube) Grab(url string, options *Options) (ret *VideoInfo, err error) { var videoId string if videoId, err = o.GetVideoId(url); err != nil { return } ret = &VideoInfo{} if options.Duration { if ret.Duration, err = o.GrabDuration(videoId); err != nil { err = fmt.Errorf("error parsing video duration: %v", err) return } } if options.Comments { if ret.Comments, err = o.GrabComments(videoId); err != nil { err = fmt.Errorf("error getting comments: %v", err) return } } if options.Transcript { if ret.Transcript, err = o.GrabTranscript(videoId); err != nil { return } } return } type Options struct { Duration bool Transcript bool Comments bool Lang string } type VideoInfo struct { Transcript string `json:"transcript"` Duration int `json:"duration"` Comments []string `json:"comments"` } func (o *YouTube) GrabByFlags() (ret *VideoInfo, err error) { options := &Options{} flag.BoolVar(&options.Duration, "duration", false, "Output only the duration") flag.BoolVar(&options.Transcript, "transcript", false, "Output only the transcript") flag.BoolVar(&options.Comments, "comments", false, "Output the comments on the video") flag.StringVar(&options.Lang, "lang", "en", "Language for the transcript (default: English)") flag.Parse() if flag.NArg() == 0 { log.Fatal("Error: No URL provided.") } url := flag.Arg(0) ret, err = o.Grab(url, options) return }