Skip to content

Commit ed84149

Browse files
authored
new ARTE crawler
1 parent 4bb1c2a commit ed84149

File tree

91 files changed

+8869
-16372
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

91 files changed

+8869
-16372
lines changed

MServer-Config.yaml

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,21 @@ maximumRequestsPerSecond: 999.0
1818
# If set only these Sender will be crawled all other will be ignored.
1919
senderIncluded:
2020
#- ARD
21-
#- ARTE_DE
22-
#- ARGE_FR
23-
#- ARTE_EN
24-
#- ARTE_PL
25-
#- ARTE_IT
26-
#- ARTE_ES
21+
- ARTE_DE
22+
- ARTE_FR
23+
- ARTE_PL
24+
- ARTE_IT
25+
- ARTE_ES
26+
- ARTE_EN
2727
#- DREISAT
2828
#- FUNK
2929
#- KIKA
3030
# - DW
31-
# - ORF
31+
#- ORF
3232
#- PHOENIX
3333
#- SRF
3434
#- SR
35-
- ZDF
35+
#- ZDF
3636

3737
#SRF,SR,PHOENIX,ORF,KIKA,DW,3SAT
3838

@@ -130,7 +130,7 @@ maximumCrawlDurationInMinutes: 120
130130

131131
# Enables the topics search
132132
# maximumSubpages limits the depth of the topics search
133-
topicsSearchEnabled: false
133+
topicsSearchEnabled: true
134134

135135
# The maximum number of sub pages to be crawled.
136136
# Example: If a Sendung overview page has 10 pages with videos for this Sendung and
@@ -159,19 +159,17 @@ senderConfigurations:
159159
ORF:
160160
maximumRequestsPerSecond: 10.0
161161
ARTE_DE:
162-
maximumUrlsPerTask: 1
163-
maximumDaysForSendungVerpasstSectionFuture: 0
164-
maximumRequestsPerSecond: 2.0
162+
maximumSubpages: 200
165163
ARTE_FR:
166-
maximumDaysForSendungVerpasstSectionFuture: 0
167-
# The maximum amount of URLs to be processed per task.
168-
# maximumUrlsPerTask: 25
169-
# The maximum duration in minutes a crawler may run.
170-
# maximumCrawlDurationInMinutes: 30
171-
# The maximum amount of sub pages to be crawled.<br>
172-
# Example: If a Sendung overview side has 10 pages with videos for this Sendung and
173-
# the amount set by this is 5 then the crawler crawls pages 1 to 5.
174-
# maximumSubpages: 3
164+
maximumSubpages: 200
165+
ARTE_EN:
166+
maximumSubpages: 200
167+
ARTE_PL:
168+
maximumSubpages: 200
169+
ARTE_IT:
170+
maximumSubpages: 200
171+
ARTE_ES:
172+
maximumSubpages: 200
175173
KIKA:
176174
maximumSubpages: 2
177175
maximumRequestsPerSecond: 8.0

src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@
1616
import de.mediathekview.mserver.base.uploader.copy.FileCopyTask;
1717
import de.mediathekview.mserver.base.utils.CheckUrlAvailability;
1818
import de.mediathekview.mserver.crawler.ard.ArdCrawler;
19-
import de.mediathekview.mserver.crawler.arte.*;
19+
import de.mediathekview.mserver.crawler.arte.ArteCrawler;
20+
import de.mediathekview.mserver.crawler.arte.ArteCrawler_EN;
21+
import de.mediathekview.mserver.crawler.arte.ArteCrawler_ES;
22+
import de.mediathekview.mserver.crawler.arte.ArteCrawler_FR;
23+
import de.mediathekview.mserver.crawler.arte.ArteCrawler_IT;
24+
import de.mediathekview.mserver.crawler.arte.ArteCrawler_PL;
2025
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
2126
import de.mediathekview.mserver.crawler.basic.IgnoreFilmFilter;
2227
import de.mediathekview.mserver.crawler.basic.TimeoutTask;
Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,10 @@
11
package de.mediathekview.mserver.crawler.arte;
22

33
public class ArteConstants {
4-
5-
public static final String BASE_URL_WWW = "https://www.arte.tv";
6-
7-
public static final String DAY_PAGE_URL =
8-
BASE_URL_WWW + "/api/rproxy/emac/v3/%s/web/pages/TV_GUIDE/?day=%s";
9-
10-
public static final int SUBCATEGORY_LIMIT = 100;
11-
public static final String URL_SUBCATEGORIES =
12-
"https://api.arte.tv/api/opa/v3/subcategories?language=%s&limit="+SUBCATEGORY_LIMIT;
13-
public static final String URL_SUBCATEGORY_VIDEOS =
14-
"%s/api/rproxy/emac/v3/%s/web/data/MOST_RECENT_SUBCATEGORY/?subCategoryCode=%s&page=%s&limit="+SUBCATEGORY_LIMIT;
15-
public static final String URL_VIDEO_LIST =
16-
"%s/api/rproxy/emac/v3/%s/web/data/VIDEO_LISTING/?imageFormats=landscape&authorizedAreas=DE_FR,EUR_DE_FR,SAT,ALL&videoType=%s&imageWithText=true&page=%s&limit=100";
17-
18-
public static final String VIDEO_LIST_TYPE_RECENT = "MOST_RECENT";
19-
public static final String VIDEO_LIST_TYPE_LAST_CHANCE = "LAST_CHANCE";
20-
21-
public static final String URL_FILM_DETAILS = "https://api.arte.tv/api/opa/v3/programs/%s/%s";
22-
public static final String URL_FILM_VIDEOS =
23-
"https://api.arte.tv/api/player/v1/config/%s/%s?platform=ARTE_NEXT";
24-
25-
public static final String AUTH_TOKEN =
26-
"Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";
27-
4+
public static final String VIDEOS_URL ="https://api.arte.tv/api/opa/v3/videos?limit=100&page=%s&sort=-creationDate&language=%s";
5+
public static final String VIDEOS_URL_ALT ="https://api.arte.tv/api/opa/v3/videos?limit=100&page=%s&sort=creationDate&language=%s";
6+
public static final String VIDEO_URL ="https://www.arte.tv/hbbtvv2/services/web/index.php/OPA/v3/streams/%s/%s/%s"; //PROGRAMID/KIND/LANG
7+
public static final String API_TOKEN = "Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";
288
private ArteConstants() {}
9+
2910
}

src/main/java/de/mediathekview/mserver/crawler/arte/ArteCrawler.java

Lines changed: 70 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,31 @@
55
import de.mediathekview.mserver.base.messages.listener.MessageListener;
66
import de.mediathekview.mserver.base.config.MServerConfigManager;
77
import de.mediathekview.mserver.base.messages.ServerMessages;
8-
import de.mediathekview.mserver.crawler.arte.tasks.*;
8+
import de.mediathekview.mserver.base.utils.JsonUtils;
9+
import de.mediathekview.mserver.crawler.arte.json.ArteVideoInfoDto;
10+
import de.mediathekview.mserver.crawler.arte.tasks.ArteDtoVideo2FilmTask;
11+
import de.mediathekview.mserver.crawler.arte.tasks.ArteVideoInfoTask;
12+
import de.mediathekview.mserver.crawler.arte.tasks.ArteVideoLinkTask;
913
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
10-
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
1114
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
1215
import de.mediathekview.mserver.progress.listeners.SenderProgressListener;
1316
import org.apache.logging.log4j.LogManager;
1417
import org.apache.logging.log4j.Logger;
1518

16-
import java.time.LocalDateTime;
17-
import java.time.format.DateTimeFormatter;
18-
import java.time.temporal.ChronoUnit;
19+
import com.google.gson.JsonElement;
20+
21+
import java.io.IOException;
1922
import java.util.Collection;
20-
import java.util.HashSet;
23+
import java.util.Map;
24+
import java.util.Optional;
2125
import java.util.Queue;
2226
import java.util.Set;
2327
import java.util.concurrent.ConcurrentLinkedQueue;
24-
import java.util.concurrent.ExecutionException;
2528
import java.util.concurrent.ForkJoinPool;
2629
import java.util.concurrent.RecursiveTask;
2730

2831
public class ArteCrawler extends AbstractCrawler {
29-
3032
private static final Logger LOG = LogManager.getLogger(ArteCrawler.class);
31-
private static final DateTimeFormatter SENDUNG_VERPASST_DATEFORMATTER =
32-
DateTimeFormatter.ofPattern("yyyy-MM-dd");
3333

3434
public ArteCrawler(
3535
final ForkJoinPool aForkJoinPool,
@@ -43,138 +43,76 @@ public ArteCrawler(
4343
public Sender getSender() {
4444
return Sender.ARTE_DE;
4545
}
46-
47-
private Queue<CrawlerUrlDTO> generateSendungVerpasstUrls(ArteLanguage language) {
48-
final Queue<CrawlerUrlDTO> sendungVerpasstUrls = new ConcurrentLinkedQueue<>();
49-
for (int i = 0;
50-
i
51-
< crawlerConfig.getMaximumDaysForSendungVerpasstSection()
52-
+ crawlerConfig.getMaximumDaysForSendungVerpasstSectionFuture();
53-
i++) {
54-
sendungVerpasstUrls.add(
55-
new CrawlerUrlDTO(
56-
String.format(
57-
ArteConstants.DAY_PAGE_URL,
58-
language.getLanguageCode().toLowerCase(),
59-
LocalDateTime.now()
60-
.plus(
61-
crawlerConfig.getMaximumDaysForSendungVerpasstSectionFuture(),
62-
ChronoUnit.DAYS)
63-
.minus(i, ChronoUnit.DAYS)
64-
.format(SENDUNG_VERPASST_DATEFORMATTER))));
65-
}
66-
return sendungVerpasstUrls;
67-
}
68-
69-
private Set<ArteFilmUrlDto> getCategoriesEntries(ArteLanguage language)
70-
throws ExecutionException, InterruptedException {
71-
final ArteSubcategoriesTask subcategoriesTask =
72-
new ArteSubcategoriesTask(this, createTopicsOverviewUrl(language));
73-
74-
final Queue<TopicUrlDTO> subcategoriesUrl = new ConcurrentLinkedQueue<>();
75-
subcategoriesUrl.addAll(forkJoinPool.submit(subcategoriesTask).get());
76-
77-
final ArteSubcategoryVideosTask subcategoryVideosTask =
78-
new ArteSubcategoryVideosTask(
79-
this, subcategoriesUrl, ArteConstants.BASE_URL_WWW, language);
80-
final Set<ArteFilmUrlDto> filmInfos = forkJoinPool.submit(subcategoryVideosTask).get();
81-
82-
printMessage(
83-
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), filmInfos.size());
84-
85-
return filmInfos;
86-
}
87-
88-
private Queue<CrawlerUrlDTO> createTopicsOverviewUrl(ArteLanguage language) {
89-
final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
90-
91-
final String url =
92-
String.format(ArteConstants.URL_SUBCATEGORIES, language.getLanguageCode().toLowerCase());
93-
94-
urls.add(new CrawlerUrlDTO(url));
95-
96-
return urls;
97-
}
98-
99-
private Set<ArteFilmUrlDto> getVideoListVideos(ArteLanguage language, String videoListType)
100-
throws ExecutionException, InterruptedException {
101-
final ArteAllVideosTask videosTask =
102-
new ArteAllVideosTask(this, createVideoListUrls(language, videoListType), language);
103-
final Set<ArteFilmUrlDto> filmInfos = forkJoinPool.submit(videosTask).get();
104-
105-
printMessage(
106-
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), filmInfos.size());
107-
108-
return filmInfos;
109-
}
110-
111-
private Queue<CrawlerUrlDTO> createVideoListUrls(ArteLanguage language, String videoListType) {
112-
final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
113-
114-
for (int i = 1; i <= getCrawlerConfig().getMaximumSubpages(); i++) {
115-
final String url =
116-
String.format(
117-
ArteConstants.URL_VIDEO_LIST,
118-
ArteConstants.BASE_URL_WWW,
119-
language.getLanguageCode().toLowerCase(),
120-
videoListType,
121-
i);
122-
123-
urls.add(new CrawlerUrlDTO(url));
124-
}
125-
return urls;
126-
}
127-
128-
private Set<ArteFilmUrlDto> getDaysEntries(ArteLanguage language) throws InterruptedException, ExecutionException {
129-
130-
final ArteDayPageTask dayTask =
131-
new ArteDayPageTask(this, generateSendungVerpasstUrls(language), language);
132-
final Set<ArteFilmUrlDto> shows = forkJoinPool.submit(dayTask).get();
133-
134-
printMessage(
135-
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
136-
137-
return shows;
46+
47+
protected ArteLanguage getLanguage() {
48+
return ArteLanguage.DE;
13849
}
13950

14051
@Override
14152
protected RecursiveTask<Set<Film>> createCrawlerTask() {
142-
final ArteLanguage language = getLanguage();
143-
try {
144-
final Set<ArteFilmUrlDto> shows = new HashSet<>();
145-
if (isDayEntriesEnabled()) {
146-
shows.addAll(getDaysEntries(language));
147-
}
148-
getVideoListVideos(language, ArteConstants.VIDEO_LIST_TYPE_RECENT).forEach(shows::add);
149-
150-
if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
151-
getCategoriesEntries(language).forEach(shows::add);
152-
153-
getVideoListVideos(language, ArteConstants.VIDEO_LIST_TYPE_LAST_CHANCE)
154-
.forEach(shows::add);
155-
}
15653

54+
try {
55+
final ArteVideoInfoTask aArteRestVideoInfoTask;
56+
// DO NOT overload - maximumUrlsPerTask used to reduce threads to 4
57+
aArteRestVideoInfoTask = new ArteVideoInfoTask(this, createVideosQueue());
58+
final Queue<ArteVideoInfoDto> videos = new ConcurrentLinkedQueue<>();
59+
videos.addAll(aArteRestVideoInfoTask.fork().join());
60+
//
15761
printMessage(
158-
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
159-
getAndSetMaxCount(shows.size());
160-
62+
ServerMessages.DEBUG_ALL_SENDUNG_COUNT, getSender().getName(), videos.size());
63+
getAndSetMaxCount(videos.size());
16164
updateProgress();
162-
return new ArteFilmTask(
163-
this, new ConcurrentLinkedQueue<>(shows), getSender(), LocalDateTime.now());
164-
} catch (final InterruptedException ex) {
165-
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
166-
Thread.currentThread().interrupt();
167-
} catch (final ExecutionException ex) {
168-
LOG.fatal("Exception in {} crawler.", getSender().getName(), ex);
65+
//
66+
final Queue<ArteVideoInfoDto> videosWithLink = new ConcurrentLinkedQueue<>();
67+
final ArteVideoLinkTask aArteRestVideosTask = new ArteVideoLinkTask(this, videos);
68+
videosWithLink.addAll(aArteRestVideosTask.fork().join());
69+
//
70+
printMessage(
71+
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), videosWithLink.size());
72+
getAndSetMaxCount(videosWithLink.size());
73+
updateProgress();
74+
//
75+
return new ArteDtoVideo2FilmTask(this, new ConcurrentLinkedQueue<>(videosWithLink));
76+
77+
} catch (final Exception ex) {
78+
LOG.fatal("Exception in {} crawler.", getSender(), ex);
16979
}
17080
return null;
17181
}
172-
173-
protected boolean isDayEntriesEnabled() {
174-
return true;
82+
83+
private Queue<TopicUrlDTO> createVideosQueue() {
84+
int maxPages = getMaxPagesForOverview();
85+
final Queue<TopicUrlDTO> root = new ConcurrentLinkedQueue<>();
86+
String rootUrl = String.format(ArteConstants.VIDEOS_URL, 1, getLanguage().toString().toLowerCase());
87+
root.add(new TopicUrlDTO("all videos1",rootUrl));
88+
if (maxPages >= 100) {
89+
String rootUrl2 = String.format(ArteConstants.VIDEOS_URL_ALT, 1, getLanguage().toString().toLowerCase());
90+
root.add(new TopicUrlDTO("all videos2",rootUrl2));
91+
}
92+
return root;
17593
}
176-
177-
protected ArteLanguage getLanguage() {
178-
return ArteLanguage.DE;
94+
95+
private int getMaxPagesForOverview() {
96+
final int naturalLimit = Math.min(100, getCrawlerConfig().getMaximumSubpages());
97+
String rootUrl = String.format(ArteConstants.VIDEOS_URL, 1, getLanguage().toString().toLowerCase());
98+
String[] path = {"meta", "videos", "pages"};
99+
try {
100+
final Map<String, String> headers = Map.of(
101+
"Accept", "application/json",
102+
"Content-Type", "application/json",
103+
"Authorization", ArteConstants.API_TOKEN
104+
);
105+
JsonElement element = getConnection().requestBodyAsJsonElement(rootUrl, headers);
106+
Optional<Integer> pages = JsonUtils.getElementValueAsInteger(element, path);
107+
if (pages.isPresent()) {
108+
return Math.min(pages.get(), naturalLimit);
109+
}
110+
} catch (IOException e) {
111+
LOG.error("getMaxPagesForOverview", e);
112+
}
113+
return naturalLimit;
179114
}
115+
180116
}
117+
118+

src/main/java/de/mediathekview/mserver/crawler/arte/ArteCrawlerUrlDto.java

Lines changed: 0 additions & 26 deletions
This file was deleted.

src/main/java/de/mediathekview/mserver/crawler/arte/ArteCrawler_EN.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,4 @@ protected ArteLanguage getLanguage() {
2828
return ArteLanguage.EN;
2929
}
3030

31-
@Override
32-
protected boolean isDayEntriesEnabled() {
33-
return false;
34-
}
3531
}

0 commit comments

Comments
 (0)