55import de .mediathekview .mserver .base .messages .listener .MessageListener ;
66import de .mediathekview .mserver .base .config .MServerConfigManager ;
77import de .mediathekview .mserver .base .messages .ServerMessages ;
8- import de .mediathekview .mserver .crawler .arte .tasks .*;
8+ import de .mediathekview .mserver .base .utils .JsonUtils ;
9+ import de .mediathekview .mserver .crawler .arte .json .ArteVideoInfoDto ;
10+ import de .mediathekview .mserver .crawler .arte .tasks .ArteDtoVideo2FilmTask ;
11+ import de .mediathekview .mserver .crawler .arte .tasks .ArteVideoInfoTask ;
12+ import de .mediathekview .mserver .crawler .arte .tasks .ArteVideoLinkTask ;
913import de .mediathekview .mserver .crawler .basic .AbstractCrawler ;
10- import de .mediathekview .mserver .crawler .basic .CrawlerUrlDTO ;
1114import de .mediathekview .mserver .crawler .basic .TopicUrlDTO ;
1215import de .mediathekview .mserver .progress .listeners .SenderProgressListener ;
1316import org .apache .logging .log4j .LogManager ;
1417import org .apache .logging .log4j .Logger ;
1518
16- import java . time . LocalDateTime ;
17- import java . time . format . DateTimeFormatter ;
18- import java .time . temporal . ChronoUnit ;
19+ import com . google . gson . JsonElement ;
20+
21+ import java .io . IOException ;
1922import java .util .Collection ;
20- import java .util .HashSet ;
23+ import java .util .Map ;
24+ import java .util .Optional ;
2125import java .util .Queue ;
2226import java .util .Set ;
2327import java .util .concurrent .ConcurrentLinkedQueue ;
24- import java .util .concurrent .ExecutionException ;
2528import java .util .concurrent .ForkJoinPool ;
2629import java .util .concurrent .RecursiveTask ;
2730
2831public class ArteCrawler extends AbstractCrawler {
29-
3032 private static final Logger LOG = LogManager .getLogger (ArteCrawler .class );
31- private static final DateTimeFormatter SENDUNG_VERPASST_DATEFORMATTER =
32- DateTimeFormatter .ofPattern ("yyyy-MM-dd" );
3333
3434 public ArteCrawler (
3535 final ForkJoinPool aForkJoinPool ,
@@ -43,138 +43,76 @@ public ArteCrawler(
4343 public Sender getSender () {
4444 return Sender .ARTE_DE ;
4545 }
46-
47- private Queue <CrawlerUrlDTO > generateSendungVerpasstUrls (ArteLanguage language ) {
48- final Queue <CrawlerUrlDTO > sendungVerpasstUrls = new ConcurrentLinkedQueue <>();
49- for (int i = 0 ;
50- i
51- < crawlerConfig .getMaximumDaysForSendungVerpasstSection ()
52- + crawlerConfig .getMaximumDaysForSendungVerpasstSectionFuture ();
53- i ++) {
54- sendungVerpasstUrls .add (
55- new CrawlerUrlDTO (
56- String .format (
57- ArteConstants .DAY_PAGE_URL ,
58- language .getLanguageCode ().toLowerCase (),
59- LocalDateTime .now ()
60- .plus (
61- crawlerConfig .getMaximumDaysForSendungVerpasstSectionFuture (),
62- ChronoUnit .DAYS )
63- .minus (i , ChronoUnit .DAYS )
64- .format (SENDUNG_VERPASST_DATEFORMATTER ))));
65- }
66- return sendungVerpasstUrls ;
67- }
68-
69- private Set <ArteFilmUrlDto > getCategoriesEntries (ArteLanguage language )
70- throws ExecutionException , InterruptedException {
71- final ArteSubcategoriesTask subcategoriesTask =
72- new ArteSubcategoriesTask (this , createTopicsOverviewUrl (language ));
73-
74- final Queue <TopicUrlDTO > subcategoriesUrl = new ConcurrentLinkedQueue <>();
75- subcategoriesUrl .addAll (forkJoinPool .submit (subcategoriesTask ).get ());
76-
77- final ArteSubcategoryVideosTask subcategoryVideosTask =
78- new ArteSubcategoryVideosTask (
79- this , subcategoriesUrl , ArteConstants .BASE_URL_WWW , language );
80- final Set <ArteFilmUrlDto > filmInfos = forkJoinPool .submit (subcategoryVideosTask ).get ();
81-
82- printMessage (
83- ServerMessages .DEBUG_ALL_SENDUNG_FOLGEN_COUNT , getSender ().getName (), filmInfos .size ());
84-
85- return filmInfos ;
86- }
87-
88- private Queue <CrawlerUrlDTO > createTopicsOverviewUrl (ArteLanguage language ) {
89- final Queue <CrawlerUrlDTO > urls = new ConcurrentLinkedQueue <>();
90-
91- final String url =
92- String .format (ArteConstants .URL_SUBCATEGORIES , language .getLanguageCode ().toLowerCase ());
93-
94- urls .add (new CrawlerUrlDTO (url ));
95-
96- return urls ;
97- }
98-
99- private Set <ArteFilmUrlDto > getVideoListVideos (ArteLanguage language , String videoListType )
100- throws ExecutionException , InterruptedException {
101- final ArteAllVideosTask videosTask =
102- new ArteAllVideosTask (this , createVideoListUrls (language , videoListType ), language );
103- final Set <ArteFilmUrlDto > filmInfos = forkJoinPool .submit (videosTask ).get ();
104-
105- printMessage (
106- ServerMessages .DEBUG_ALL_SENDUNG_FOLGEN_COUNT , getSender ().getName (), filmInfos .size ());
107-
108- return filmInfos ;
109- }
110-
111- private Queue <CrawlerUrlDTO > createVideoListUrls (ArteLanguage language , String videoListType ) {
112- final Queue <CrawlerUrlDTO > urls = new ConcurrentLinkedQueue <>();
113-
114- for (int i = 1 ; i <= getCrawlerConfig ().getMaximumSubpages (); i ++) {
115- final String url =
116- String .format (
117- ArteConstants .URL_VIDEO_LIST ,
118- ArteConstants .BASE_URL_WWW ,
119- language .getLanguageCode ().toLowerCase (),
120- videoListType ,
121- i );
122-
123- urls .add (new CrawlerUrlDTO (url ));
124- }
125- return urls ;
126- }
127-
128- private Set <ArteFilmUrlDto > getDaysEntries (ArteLanguage language ) throws InterruptedException , ExecutionException {
129-
130- final ArteDayPageTask dayTask =
131- new ArteDayPageTask (this , generateSendungVerpasstUrls (language ), language );
132- final Set <ArteFilmUrlDto > shows = forkJoinPool .submit (dayTask ).get ();
133-
134- printMessage (
135- ServerMessages .DEBUG_ALL_SENDUNG_FOLGEN_COUNT , getSender ().getName (), shows .size ());
136-
137- return shows ;
46+
47+ protected ArteLanguage getLanguage () {
48+ return ArteLanguage .DE ;
13849 }
13950
14051 @ Override
14152 protected RecursiveTask <Set <Film >> createCrawlerTask () {
142- final ArteLanguage language = getLanguage ();
143- try {
144- final Set <ArteFilmUrlDto > shows = new HashSet <>();
145- if (isDayEntriesEnabled ()) {
146- shows .addAll (getDaysEntries (language ));
147- }
148- getVideoListVideos (language , ArteConstants .VIDEO_LIST_TYPE_RECENT ).forEach (shows ::add );
149-
150- if (Boolean .TRUE .equals (crawlerConfig .getTopicsSearchEnabled ())) {
151- getCategoriesEntries (language ).forEach (shows ::add );
152-
153- getVideoListVideos (language , ArteConstants .VIDEO_LIST_TYPE_LAST_CHANCE )
154- .forEach (shows ::add );
155- }
15653
54+ try {
55+ final ArteVideoInfoTask aArteRestVideoInfoTask ;
56+ // DO NOT overload - maximumUrlsPerTask used to reduce threads to 4
57+ aArteRestVideoInfoTask = new ArteVideoInfoTask (this , createVideosQueue ());
58+ final Queue <ArteVideoInfoDto > videos = new ConcurrentLinkedQueue <>();
59+ videos .addAll (aArteRestVideoInfoTask .fork ().join ());
60+ //
15761 printMessage (
158- ServerMessages .DEBUG_ALL_SENDUNG_FOLGEN_COUNT , getSender ().getName (), shows .size ());
159- getAndSetMaxCount (shows .size ());
160-
62+ ServerMessages .DEBUG_ALL_SENDUNG_COUNT , getSender ().getName (), videos .size ());
63+ getAndSetMaxCount (videos .size ());
16164 updateProgress ();
162- return new ArteFilmTask (
163- this , new ConcurrentLinkedQueue <>(shows ), getSender (), LocalDateTime .now ());
164- } catch (final InterruptedException ex ) {
165- LOG .debug ("{} crawler interrupted." , getSender ().getName (), ex );
166- Thread .currentThread ().interrupt ();
167- } catch (final ExecutionException ex ) {
168- LOG .fatal ("Exception in {} crawler." , getSender ().getName (), ex );
65+ //
66+ final Queue <ArteVideoInfoDto > videosWithLink = new ConcurrentLinkedQueue <>();
67+ final ArteVideoLinkTask aArteRestVideosTask = new ArteVideoLinkTask (this , videos );
68+ videosWithLink .addAll (aArteRestVideosTask .fork ().join ());
69+ //
70+ printMessage (
71+ ServerMessages .DEBUG_ALL_SENDUNG_FOLGEN_COUNT , getSender ().getName (), videosWithLink .size ());
72+ getAndSetMaxCount (videosWithLink .size ());
73+ updateProgress ();
74+ //
75+ return new ArteDtoVideo2FilmTask (this , new ConcurrentLinkedQueue <>(videosWithLink ));
76+
77+ } catch (final Exception ex ) {
78+ LOG .fatal ("Exception in {} crawler." , getSender (), ex );
16979 }
17080 return null ;
17181 }
172-
173- protected boolean isDayEntriesEnabled () {
174- return true ;
82+
83+ private Queue <TopicUrlDTO > createVideosQueue () {
84+ int maxPages = getMaxPagesForOverview ();
85+ final Queue <TopicUrlDTO > root = new ConcurrentLinkedQueue <>();
86+ String rootUrl = String .format (ArteConstants .VIDEOS_URL , 1 , getLanguage ().toString ().toLowerCase ());
87+ root .add (new TopicUrlDTO ("all videos1" ,rootUrl ));
88+ if (maxPages >= 100 ) {
89+ String rootUrl2 = String .format (ArteConstants .VIDEOS_URL_ALT , 1 , getLanguage ().toString ().toLowerCase ());
90+ root .add (new TopicUrlDTO ("all videos2" ,rootUrl2 ));
91+ }
92+ return root ;
17593 }
176-
177- protected ArteLanguage getLanguage () {
178- return ArteLanguage .DE ;
94+
95+ private int getMaxPagesForOverview () {
96+ final int naturalLimit = Math .min (100 , getCrawlerConfig ().getMaximumSubpages ());
97+ String rootUrl = String .format (ArteConstants .VIDEOS_URL , 1 , getLanguage ().toString ().toLowerCase ());
98+ String [] path = {"meta" , "videos" , "pages" };
99+ try {
100+ final Map <String , String > headers = Map .of (
101+ "Accept" , "application/json" ,
102+ "Content-Type" , "application/json" ,
103+ "Authorization" , ArteConstants .API_TOKEN
104+ );
105+ JsonElement element = getConnection ().requestBodyAsJsonElement (rootUrl , headers );
106+ Optional <Integer > pages = JsonUtils .getElementValueAsInteger (element , path );
107+ if (pages .isPresent ()) {
108+ return Math .min (pages .get (), naturalLimit );
109+ }
110+ } catch (IOException e ) {
111+ LOG .error ("getMaxPagesForOverview" , e );
112+ }
113+ return naturalLimit ;
179114 }
115+
180116}
117+
118+
0 commit comments