Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 19 additions & 21 deletions MServer-Config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@ maximumRequestsPerSecond: 999.0
# If set only these Sender will be crawled all other will be ignored.
senderIncluded:
#- ARD
#- ARTE_DE
#- ARTE_FR
#- ARTE_EN
#- ARTE_PL
#- ARTE_IT
#- ARTE_ES
- ARTE_DE
- ARTE_FR
- ARTE_PL
- ARTE_IT
- ARTE_ES
- ARTE_EN
#- DREISAT
#- FUNK
#- KIKA
# - DW
# - ORF
#- ORF
#- PHOENIX
#- SRF
#- SR
- ZDF
#- ZDF

#SRF,SR,PHOENIX,ORF,KIKA,DW,3SAT

Expand Down Expand Up @@ -130,7 +130,7 @@ maximumCrawlDurationInMinutes: 120

# Enables the topics search
# maximumSubpages limits the depth of the topics search
topicsSearchEnabled: false
topicsSearchEnabled: true

# The maximum number of sub pages to be crawled.
# Example: If a Sendung overview page has 10 pages with videos for this Sendung and
Expand Down Expand Up @@ -159,19 +159,17 @@ senderConfigurations:
ORF:
maximumRequestsPerSecond: 10.0
ARTE_DE:
maximumUrlsPerTask: 1
maximumDaysForSendungVerpasstSectionFuture: 0
maximumRequestsPerSecond: 2.0
maximumSubpages: 200
ARTE_FR:
maximumDaysForSendungVerpasstSectionFuture: 0
# The maximum amount of URLs to be processed per task.
# maximumUrlsPerTask: 25
# The maximum duration in minutes a crawler may run.
# maximumCrawlDurationInMinutes: 30
# The maximum number of sub pages to be crawled.
# Example: If a Sendung overview page has 10 pages with videos for this Sendung and
# the limit set here is 5, then the crawler crawls pages 1 to 5.
# maximumSubpages: 3
maximumSubpages: 200
ARTE_EN:
maximumSubpages: 200
ARTE_PL:
maximumSubpages: 200
ARTE_IT:
maximumSubpages: 200
ARTE_ES:
maximumSubpages: 200
KIKA:
maximumSubpages: 2
maximumRequestsPerSecond: 8.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
import de.mediathekview.mserver.base.uploader.copy.FileCopyTask;
import de.mediathekview.mserver.base.utils.CheckUrlAvailability;
import de.mediathekview.mserver.crawler.ard.ArdCrawler;
import de.mediathekview.mserver.crawler.arte.*;
import de.mediathekview.mserver.crawler.arte.ArteCrawler;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_EN;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_ES;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_FR;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_IT;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_PL;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.IgnoreFilmFilter;
import de.mediathekview.mserver.crawler.basic.TimeoutTask;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,29 +1,10 @@
package de.mediathekview.mserver.crawler.arte;

/**
 * Constant URLs, URL templates and the bearer token used by the ARTE crawler.
 *
 * <p>All templates are meant to be filled with {@link String#format(String, Object...)}; the
 * {@code %s} placeholders are listed on each constant in the order they appear.
 */
public class ArteConstants {

/** Base URL of the ARTE website. */
public static final String BASE_URL_WWW = "https://www.arte.tv";

/** TV guide day page template. Placeholders: lower-case language code, day (yyyy-MM-dd). */
public static final String DAY_PAGE_URL =
BASE_URL_WWW + "/api/rproxy/emac/v3/%s/web/pages/TV_GUIDE/?day=%s";

/** Value appended as the {@code limit} query parameter of the subcategory URLs below. */
public static final int SUBCATEGORY_LIMIT = 100;
/** Subcategory overview template. Placeholder: lower-case language code. */
public static final String URL_SUBCATEGORIES =
"https://api.arte.tv/api/opa/v3/subcategories?language=%s&limit="+SUBCATEGORY_LIMIT;
/**
 * Videos of one subcategory. Placeholders as suggested by the template: base URL, language,
 * subcategory code, page number. NOTE(review): confirm against the task that formats it.
 */
public static final String URL_SUBCATEGORY_VIDEOS =
"%s/api/rproxy/emac/v3/%s/web/data/MOST_RECENT_SUBCATEGORY/?subCategoryCode=%s&page=%s&limit="+SUBCATEGORY_LIMIT;
/** Video listing template. Placeholders: base URL, lower-case language code, video list type, page number. */
public static final String URL_VIDEO_LIST =
"%s/api/rproxy/emac/v3/%s/web/data/VIDEO_LISTING/?imageFormats=landscape&authorizedAreas=DE_FR,EUR_DE_FR,SAT,ALL&videoType=%s&imageWithText=true&page=%s&limit=100";

/** Video list type value for the {@code videoType} placeholder of {@link #URL_VIDEO_LIST}. */
public static final String VIDEO_LIST_TYPE_RECENT = "MOST_RECENT";
/** Video list type value for the {@code videoType} placeholder of {@link #URL_VIDEO_LIST}. */
public static final String VIDEO_LIST_TYPE_LAST_CHANCE = "LAST_CHANCE";

/** Program details template with two placeholders. NOTE(review): their meaning is not visible here - confirm with the caller. */
public static final String URL_FILM_DETAILS = "https://api.arte.tv/api/opa/v3/programs/%s/%s";
/** Player config template with two placeholders. NOTE(review): their meaning is not visible here - confirm with the caller. */
public static final String URL_FILM_VIDEOS =
"https://api.arte.tv/api/player/v1/config/%s/%s?platform=ARTE_NEXT";

/** Bearer token value; holds the same literal as {@link #API_TOKEN}. */
public static final String AUTH_TOKEN =
"Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";

/** All-videos listing, sorted by {@code -creationDate}. Placeholders: page number, lower-case language. */
public static final String VIDEOS_URL ="https://api.arte.tv/api/opa/v3/videos?limit=100&page=%s&sort=-creationDate&language=%s";
/** Same listing as {@link #VIDEOS_URL} but sorted by {@code creationDate} (opposite sort key sign). */
public static final String VIDEOS_URL_ALT ="https://api.arte.tv/api/opa/v3/videos?limit=100&page=%s&sort=creationDate&language=%s";
/** Stream lookup template. Placeholders: program id, kind, language. */
public static final String VIDEO_URL ="https://www.arte.tv/hbbtvv2/services/web/index.php/OPA/v3/streams/%s/%s/%s"; //PROGRAMID/KIND/LANG
/** Bearer token value sent as the {@code Authorization} header of ARTE API requests. */
public static final String API_TOKEN = "Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";
/** Constants holder; not meant to be instantiated. */
private ArteConstants() {}

}
202 changes: 70 additions & 132 deletions src/main/java/de/mediathekview/mserver/crawler/arte/ArteCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,31 @@
import de.mediathekview.mserver.base.messages.listener.MessageListener;
import de.mediathekview.mserver.base.config.MServerConfigManager;
import de.mediathekview.mserver.base.messages.ServerMessages;
import de.mediathekview.mserver.crawler.arte.tasks.*;
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.crawler.arte.json.ArteVideoInfoDto;
import de.mediathekview.mserver.crawler.arte.tasks.ArteDtoVideo2FilmTask;
import de.mediathekview.mserver.crawler.arte.tasks.ArteVideoInfoTask;
import de.mediathekview.mserver.crawler.arte.tasks.ArteVideoLinkTask;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.progress.listeners.SenderProgressListener;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import com.google.gson.JsonElement;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

public class ArteCrawler extends AbstractCrawler {

private static final Logger LOG = LogManager.getLogger(ArteCrawler.class);
private static final DateTimeFormatter SENDUNG_VERPASST_DATEFORMATTER =
DateTimeFormatter.ofPattern("yyyy-MM-dd");

public ArteCrawler(
final ForkJoinPool aForkJoinPool,
Expand All @@ -43,138 +43,76 @@ public ArteCrawler(
// Reports the sender this crawler produces films for: the German ARTE channel.
public Sender getSender() {
return Sender.ARTE_DE;
}

/**
 * Builds the queue of TV guide ("Sendung verpasst") day page URLs for the given language.
 *
 * <p>One URL is created per day. The newest day is today plus the configured number of future
 * days; from there the method walks backwards one day at a time until (configured past days +
 * configured future days) URLs have been produced.
 *
 * @param language the ARTE language whose lower-cased language code is inserted into the URL
 * @return a queue holding one {@link CrawlerUrlDTO} per day page; empty when the configured
 *     day counts sum to zero or less
 */
private Queue<CrawlerUrlDTO> generateSendungVerpasstUrls(ArteLanguage language) {
  final Queue<CrawlerUrlDTO> sendungVerpasstUrls = new ConcurrentLinkedQueue<>();

  // Read the configuration once instead of on every loop iteration.
  final long futureDays = crawlerConfig.getMaximumDaysForSendungVerpasstSectionFuture();
  final long totalDays = crawlerConfig.getMaximumDaysForSendungVerpasstSection() + futureDays;
  final String languageCode = language.getLanguageCode().toLowerCase();

  // Resolve "now" a single time: calling LocalDateTime.now() inside the loop could yield
  // different calendar days for different iterations when the crawl runs across midnight,
  // which would skip or duplicate a day page.
  final LocalDateTime newestDay = LocalDateTime.now().plus(futureDays, ChronoUnit.DAYS);

  for (int i = 0; i < totalDays; i++) {
    final String day = newestDay.minus(i, ChronoUnit.DAYS).format(SENDUNG_VERPASST_DATEFORMATTER);
    sendungVerpasstUrls.add(
        new CrawlerUrlDTO(String.format(ArteConstants.DAY_PAGE_URL, languageCode, day)));
  }
  return sendungVerpasstUrls;
}

/**
 * Collects the film URLs reachable through the subcategory listing of the given language.
 *
 * <p>Runs two tasks on the crawler's fork/join pool, one after the other: first the
 * subcategory overview is resolved into topic URLs, then the videos of those subcategories
 * are gathered. The number of films found is reported via {@code printMessage}.
 *
 * @param language the ARTE language to crawl
 * @return the film URL DTOs found in all subcategories
 * @throws ExecutionException if one of the submitted tasks fails
 * @throws InterruptedException if waiting for a task result is interrupted
 */
private Set<ArteFilmUrlDto> getCategoriesEntries(ArteLanguage language)
    throws ExecutionException, InterruptedException {
  // Step 1: resolve the subcategory overview into a queue of topic URLs.
  final Queue<TopicUrlDTO> topicQueue =
      new ConcurrentLinkedQueue<>(
          forkJoinPool
              .submit(new ArteSubcategoriesTask(this, createTopicsOverviewUrl(language)))
              .get());

  // Step 2: crawl the videos of every subcategory.
  final Set<ArteFilmUrlDto> collected =
      forkJoinPool
          .submit(
              new ArteSubcategoryVideosTask(
                  this, topicQueue, ArteConstants.BASE_URL_WWW, language))
          .get();

  printMessage(
      ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), collected.size());
  return collected;
}

/**
 * Creates the single-element queue holding the subcategory overview URL for a language.
 *
 * @param language the ARTE language whose lower-cased code is inserted into the URL template
 * @return a queue containing exactly one {@link CrawlerUrlDTO}
 */
private Queue<CrawlerUrlDTO> createTopicsOverviewUrl(ArteLanguage language) {
  final String languageCode = language.getLanguageCode().toLowerCase();
  final CrawlerUrlDTO overview =
      new CrawlerUrlDTO(String.format(ArteConstants.URL_SUBCATEGORIES, languageCode));

  final Queue<CrawlerUrlDTO> result = new ConcurrentLinkedQueue<>();
  result.add(overview);
  return result;
}

/**
 * Collects the film URLs of one video listing type for the given language.
 *
 * <p>Submits an {@code ArteAllVideosTask} over the listing page URLs to the crawler's
 * fork/join pool, waits for it, and reports the number of films found via
 * {@code printMessage}.
 *
 * @param language the ARTE language to crawl
 * @param videoListType the listing type inserted into the listing URL
 * @return the film URL DTOs found on the listing pages
 * @throws ExecutionException if the submitted task fails
 * @throws InterruptedException if waiting for the task result is interrupted
 */
private Set<ArteFilmUrlDto> getVideoListVideos(ArteLanguage language, String videoListType)
    throws ExecutionException, InterruptedException {
  final Queue<CrawlerUrlDTO> listingUrls = createVideoListUrls(language, videoListType);
  final Set<ArteFilmUrlDto> found =
      forkJoinPool.submit(new ArteAllVideosTask(this, listingUrls, language)).get();

  printMessage(
      ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), found.size());
  return found;
}

/**
 * Creates one listing page URL per page, from page 1 up to and including the configured
 * maximum number of sub pages.
 *
 * @param language the ARTE language whose lower-cased code is inserted into each URL
 * @param videoListType the listing type inserted into each URL
 * @return a queue with the listing page URLs in ascending page order; empty when the
 *     configured maximum is below 1
 */
private Queue<CrawlerUrlDTO> createVideoListUrls(ArteLanguage language, String videoListType) {
  final Queue<CrawlerUrlDTO> pageUrls = new ConcurrentLinkedQueue<>();

  int page = 1;
  while (page <= getCrawlerConfig().getMaximumSubpages()) {
    pageUrls.add(
        new CrawlerUrlDTO(
            String.format(
                ArteConstants.URL_VIDEO_LIST,
                ArteConstants.BASE_URL_WWW,
                language.getLanguageCode().toLowerCase(),
                videoListType,
                page)));
    page++;
  }
  return pageUrls;
}

private Set<ArteFilmUrlDto> getDaysEntries(ArteLanguage language) throws InterruptedException, ExecutionException {

final ArteDayPageTask dayTask =
new ArteDayPageTask(this, generateSendungVerpasstUrls(language), language);
final Set<ArteFilmUrlDto> shows = forkJoinPool.submit(dayTask).get();

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());

return shows;

protected ArteLanguage getLanguage() {
return ArteLanguage.DE;
}

@Override
protected RecursiveTask<Set<Film>> createCrawlerTask() {
final ArteLanguage language = getLanguage();
try {
final Set<ArteFilmUrlDto> shows = new HashSet<>();
if (isDayEntriesEnabled()) {
shows.addAll(getDaysEntries(language));
}
getVideoListVideos(language, ArteConstants.VIDEO_LIST_TYPE_RECENT).forEach(shows::add);

if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
getCategoriesEntries(language).forEach(shows::add);

getVideoListVideos(language, ArteConstants.VIDEO_LIST_TYPE_LAST_CHANCE)
.forEach(shows::add);
}

try {
final ArteVideoInfoTask aArteRestVideoInfoTask;
// DO NOT overload - maximumUrlsPerTask used to reduce threads to 4
aArteRestVideoInfoTask = new ArteVideoInfoTask(this, createVideosQueue());
final Queue<ArteVideoInfoDto> videos = new ConcurrentLinkedQueue<>();
videos.addAll(aArteRestVideoInfoTask.fork().join());
//
printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
getAndSetMaxCount(shows.size());

ServerMessages.DEBUG_ALL_SENDUNG_COUNT, getSender().getName(), videos.size());
getAndSetMaxCount(videos.size());
updateProgress();
return new ArteFilmTask(
this, new ConcurrentLinkedQueue<>(shows), getSender(), LocalDateTime.now());
} catch (final InterruptedException ex) {
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
Thread.currentThread().interrupt();
} catch (final ExecutionException ex) {
LOG.fatal("Exception in {} crawler.", getSender().getName(), ex);
//
final Queue<ArteVideoInfoDto> videosWithLink = new ConcurrentLinkedQueue<>();
final ArteVideoLinkTask aArteRestVideosTask = new ArteVideoLinkTask(this, videos);
videosWithLink.addAll(aArteRestVideosTask.fork().join());
//
printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), videosWithLink.size());
getAndSetMaxCount(videosWithLink.size());
updateProgress();
//
return new ArteDtoVideo2FilmTask(this, new ConcurrentLinkedQueue<>(videosWithLink));

} catch (final Exception ex) {
LOG.fatal("Exception in {} crawler.", getSender(), ex);
}
return null;
}

protected boolean isDayEntriesEnabled() {
return true;

private Queue<TopicUrlDTO> createVideosQueue() {
int maxPages = getMaxPagesForOverview();
final Queue<TopicUrlDTO> root = new ConcurrentLinkedQueue<>();
String rootUrl = String.format(ArteConstants.VIDEOS_URL, 1, getLanguage().toString().toLowerCase());
root.add(new TopicUrlDTO("all videos1",rootUrl));
if (maxPages >= 100) {
String rootUrl2 = String.format(ArteConstants.VIDEOS_URL_ALT, 1, getLanguage().toString().toLowerCase());
root.add(new TopicUrlDTO("all videos2",rootUrl2));
}
return root;
}

protected ArteLanguage getLanguage() {
return ArteLanguage.DE;

private int getMaxPagesForOverview() {
final int naturalLimit = Math.min(100, getCrawlerConfig().getMaximumSubpages());
String rootUrl = String.format(ArteConstants.VIDEOS_URL, 1, getLanguage().toString().toLowerCase());
String[] path = {"meta", "videos", "pages"};
try {
final Map<String, String> headers = Map.of(
"Accept", "application/json",
"Content-Type", "application/json",
"Authorization", ArteConstants.API_TOKEN
);
JsonElement element = getConnection().requestBodyAsJsonElement(rootUrl, headers);
Optional<Integer> pages = JsonUtils.getElementValueAsInteger(element, path);
if (pages.isPresent()) {
return Math.min(pages.get(), naturalLimit);
}
} catch (IOException e) {
LOG.error("getMaxPagesForOverview", e);
}
return naturalLimit;
}

}


This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,4 @@ protected ArteLanguage getLanguage() {
return ArteLanguage.EN;
}

// Turns off the day-page part of the crawl for this language variant: the base crawler only
// collects day entries when this hook returns true.
@Override
protected boolean isDayEntriesEnabled() {
return false;
}
}
Loading