Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions MServer-Config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,20 @@ maximumRequestsPerSecond: 999.0
# If set only these Sender will be crawled all other will be ignored.
senderIncluded:
#- ARD
#- ARTE_DE
#- ARGE_FR
#- ARTE_EN
- ARTE_DE
#- ARTE_FR
#- ARTE_PL
#- ARTE_IT
#- ARTE_ES
#- DREISAT
#- FUNK
#- KIKA
# - DW
# - ORF
#- ORF
#- PHOENIX
#- SRF
#- SR
- ZDF
#- ZDF

#SRF,SR,PHOENIX,ORF,KIKA,DW,3SAT

Expand Down Expand Up @@ -130,7 +129,7 @@ maximumCrawlDurationInMinutes: 120

# Enables the topics search
# maximumSubpages limits the depth of the topics search
topicsSearchEnabled: false
topicsSearchEnabled: true

# The maximum number of sub pages to be crawled.
# Example: If a Sendung overview page has 10 pages with videos for this Sendung and
Expand Down Expand Up @@ -159,19 +158,23 @@ senderConfigurations:
ORF:
maximumRequestsPerSecond: 10.0
ARTE_DE:
maximumUrlsPerTask: 1
maximumDaysForSendungVerpasstSectionFuture: 0
maximumRequestsPerSecond: 2.0
maximumUrlsPerTask: 5
maximumSubpages: 100
ARTE_FR:
maximumDaysForSendungVerpasstSectionFuture: 0
# The maximum amount of URLs to be processed per task.
# maximumUrlsPerTask: 25
# The maximum duration in minutes a crawler may run.
# maximumCrawlDurationInMinutes: 30
# The maximum amount of sub pages to be crawled.<br>
# Example: If a Sendung overview side has 10 pages with videos for this Sendung and
# the amount set by this is 5 then the crawler crawls pages 1 to 5.
# maximumSubpages: 3
maximumUrlsPerTask: 25
maximumSubpages: 100
ARTE_EN:
maximumUrlsPerTask: 25
maximumSubpages: 100
ARTE_PL:
maximumUrlsPerTask: 25
maximumSubpages: 100
ARTE_IT:
maximumUrlsPerTask: 25
maximumSubpages: 100
ARTE_ES:
maximumUrlsPerTask: 25
maximumSubpages: 100
KIKA:
maximumSubpages: 2
maximumRequestsPerSecond: 8.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
import de.mediathekview.mserver.base.uploader.copy.FileCopyTask;
import de.mediathekview.mserver.base.utils.CheckUrlAvailability;
import de.mediathekview.mserver.crawler.ard.ArdCrawler;
import de.mediathekview.mserver.crawler.arte.*;
import de.mediathekview.mserver.crawler.arte.ArteCrawler;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_EN;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_ES;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_FR;
import de.mediathekview.mserver.crawler.arte.ArtetCrawler_IT;
import de.mediathekview.mserver.crawler.arte.ArteCrawler_PL;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.IgnoreFilmFilter;
import de.mediathekview.mserver.crawler.basic.TimeoutTask;
Expand Down Expand Up @@ -504,7 +509,7 @@ private void initializeCrawler(final MServerConfigManager rootConfig) {
new ArteCrawler_PL(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.ARTE_IT,
new ArteCrawler_IT(forkJoinPool, messageListeners, progressListeners, rootConfig));
new ArtetCrawler_IT(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.ARTE_ES,
new ArteCrawler_ES(forkJoinPool, messageListeners, progressListeners, rootConfig));
Expand All @@ -519,8 +524,6 @@ private void initializeCrawler(final MServerConfigManager rootConfig) {
new KikaApiCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.DW, new DwCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
//crawlerMap.put(
// Sender.ORF, new OrfCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.ORF, new OrfOnCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,29 +1,9 @@
package de.mediathekview.mserver.crawler.arte;

public class ArteConstants {

public static final String BASE_URL_WWW = "https://www.arte.tv";

public static final String DAY_PAGE_URL =
BASE_URL_WWW + "/api/rproxy/emac/v3/%s/web/pages/TV_GUIDE/?day=%s";

public static final int SUBCATEGORY_LIMIT = 100;
public static final String URL_SUBCATEGORIES =
"https://api.arte.tv/api/opa/v3/subcategories?language=%s&limit="+SUBCATEGORY_LIMIT;
public static final String URL_SUBCATEGORY_VIDEOS =
"%s/api/rproxy/emac/v3/%s/web/data/MOST_RECENT_SUBCATEGORY/?subCategoryCode=%s&page=%s&limit="+SUBCATEGORY_LIMIT;
public static final String URL_VIDEO_LIST =
"%s/api/rproxy/emac/v3/%s/web/data/VIDEO_LISTING/?imageFormats=landscape&authorizedAreas=DE_FR,EUR_DE_FR,SAT,ALL&videoType=%s&imageWithText=true&page=%s&limit=100";

public static final String VIDEO_LIST_TYPE_RECENT = "MOST_RECENT";
public static final String VIDEO_LIST_TYPE_LAST_CHANCE = "LAST_CHANCE";

public static final String URL_FILM_DETAILS = "https://api.arte.tv/api/opa/v3/programs/%s/%s";
public static final String URL_FILM_VIDEOS =
"https://api.arte.tv/api/player/v1/config/%s/%s?platform=ARTE_NEXT";

public static final String AUTH_TOKEN =
"Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";

public final static String VIDEOS_URL ="https://api.arte.tv/api/opa/v3/videos?limit=100&page=%s&sort=-creationDate&language=%s";
public final static String VIDEO_URL ="https://www.arte.tv/hbbtvv2/services/web/index.php/OPA/v3/streams/%s/%s/%s"; //PROGRAMID/KIND/LANG
public final static String API_TOKEN = "Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";
private ArteConstants() {}

}
171 changes: 40 additions & 131 deletions src/main/java/de/mediathekview/mserver/crawler/arte/ArteCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,25 @@
import de.mediathekview.mlib.messages.listener.MessageListener;
import de.mediathekview.mserver.base.config.MServerConfigManager;
import de.mediathekview.mserver.base.messages.ServerMessages;
import de.mediathekview.mserver.crawler.arte.tasks.*;
import de.mediathekview.mserver.crawler.arte.json.ArteVideoInfoDto;
import de.mediathekview.mserver.crawler.arte.tasks.ArteDtoVideo2FilmTask;
import de.mediathekview.mserver.crawler.arte.tasks.ArteVideoInfoTask;
import de.mediathekview.mserver.crawler.arte.tasks.ArteVideoLinkTask;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.progress.listeners.SenderProgressListener;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.Collection;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

public class ArteCrawler extends AbstractCrawler {

private static final Logger LOG = LogManager.getLogger(ArteCrawler.class);
private static final DateTimeFormatter SENDUNG_VERPASST_DATEFORMATTER =
DateTimeFormatter.ofPattern("yyyy-MM-dd");

public ArteCrawler(
final ForkJoinPool aForkJoinPool,
Expand All @@ -43,138 +37,53 @@ public ArteCrawler(
public Sender getSender() {
return Sender.ARTE_DE;
}

private Queue<CrawlerUrlDTO> generateSendungVerpasstUrls(ArteLanguage language) {
final Queue<CrawlerUrlDTO> sendungVerpasstUrls = new ConcurrentLinkedQueue<>();
for (int i = 0;
i
< crawlerConfig.getMaximumDaysForSendungVerpasstSection()
+ crawlerConfig.getMaximumDaysForSendungVerpasstSectionFuture();
i++) {
sendungVerpasstUrls.add(
new CrawlerUrlDTO(
String.format(
ArteConstants.DAY_PAGE_URL,
language.getLanguageCode().toLowerCase(),
LocalDateTime.now()
.plus(
crawlerConfig.getMaximumDaysForSendungVerpasstSectionFuture(),
ChronoUnit.DAYS)
.minus(i, ChronoUnit.DAYS)
.format(SENDUNG_VERPASST_DATEFORMATTER))));
}
return sendungVerpasstUrls;
}

private Set<ArteFilmUrlDto> getCategoriesEntries(ArteLanguage language)
throws ExecutionException, InterruptedException {
final ArteSubcategoriesTask subcategoriesTask =
new ArteSubcategoriesTask(this, createTopicsOverviewUrl(language));

final Queue<TopicUrlDTO> subcategoriesUrl = new ConcurrentLinkedQueue<>();
subcategoriesUrl.addAll(forkJoinPool.submit(subcategoriesTask).get());

final ArteSubcategoryVideosTask subcategoryVideosTask =
new ArteSubcategoryVideosTask(
this, subcategoriesUrl, ArteConstants.BASE_URL_WWW, language);
final Set<ArteFilmUrlDto> filmInfos = forkJoinPool.submit(subcategoryVideosTask).get();

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), filmInfos.size());

return filmInfos;
}

private Queue<CrawlerUrlDTO> createTopicsOverviewUrl(ArteLanguage language) {
final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();

final String url =
String.format(ArteConstants.URL_SUBCATEGORIES, language.getLanguageCode().toLowerCase());

urls.add(new CrawlerUrlDTO(url));

return urls;
}

private Set<ArteFilmUrlDto> getVideoListVideos(ArteLanguage language, String videoListType)
throws ExecutionException, InterruptedException {
final ArteAllVideosTask videosTask =
new ArteAllVideosTask(this, createVideoListUrls(language, videoListType), language);
final Set<ArteFilmUrlDto> filmInfos = forkJoinPool.submit(videosTask).get();

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), filmInfos.size());

return filmInfos;
}

private Queue<CrawlerUrlDTO> createVideoListUrls(ArteLanguage language, String videoListType) {
final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();

for (int i = 1; i <= getCrawlerConfig().getMaximumSubpages(); i++) {
final String url =
String.format(
ArteConstants.URL_VIDEO_LIST,
ArteConstants.BASE_URL_WWW,
language.getLanguageCode().toLowerCase(),
videoListType,
i);

urls.add(new CrawlerUrlDTO(url));
}
return urls;
}

private Set<ArteFilmUrlDto> getDaysEntries(ArteLanguage language) throws InterruptedException, ExecutionException {

final ArteDayPageTask dayTask =
new ArteDayPageTask(this, generateSendungVerpasstUrls(language), language);
final Set<ArteFilmUrlDto> shows = forkJoinPool.submit(dayTask).get();

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());

return shows;

/**
 * Returns the ARTE language variant this crawler works on.
 *
 * <p>This base implementation returns {@code ArteLanguage.DE}. The value is lower-cased and
 * inserted into the paged video list URL when the crawl queue is built.
 * NOTE(review): the language-specific crawlers (e.g. the EN/ES variants) appear to override
 * this method to return their own language - confirm against those classes.
 *
 * @return the language of this crawler, {@code ArteLanguage.DE} here
 */
protected ArteLanguage getLanguage() {
return ArteLanguage.DE;
}

@Override
protected RecursiveTask<Set<Film>> createCrawlerTask() {
final ArteLanguage language = getLanguage();
try {
final Set<ArteFilmUrlDto> shows = new HashSet<>();
if (isDayEntriesEnabled()) {
shows.addAll(getDaysEntries(language));
}
getVideoListVideos(language, ArteConstants.VIDEO_LIST_TYPE_RECENT).forEach(shows::add);

if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
getCategoriesEntries(language).forEach(shows::add);

getVideoListVideos(language, ArteConstants.VIDEO_LIST_TYPE_LAST_CHANCE)
.forEach(shows::add);
}

try {
// DO NOT overload - maximumUrlsPerTask used to reduce threads to 4
final ArteVideoInfoTask aArteRestVideoInfoTask = new ArteVideoInfoTask(this, createAllVideosQueue());
final Queue<ArteVideoInfoDto> videos = new ConcurrentLinkedQueue<>();
videos.addAll(aArteRestVideoInfoTask.fork().join());
//
printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
getAndSetMaxCount(shows.size());

ServerMessages.DEBUG_ALL_SENDUNG_COUNT, getSender().getName(), videos.size());
getAndSetMaxCount(videos.size());
updateProgress();
//
final Queue<ArteVideoInfoDto> videosWithLink = new ConcurrentLinkedQueue<>();
final ArteVideoLinkTask aArteRestVideosTask = new ArteVideoLinkTask(this, videos);
videosWithLink.addAll(aArteRestVideosTask.fork().join());
//
printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), videosWithLink.size());
getAndSetMaxCount(videosWithLink.size());
updateProgress();
return new ArteFilmTask(
this, new ConcurrentLinkedQueue<>(shows), getSender(), LocalDateTime.now());
} catch (final InterruptedException ex) {
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
Thread.currentThread().interrupt();
} catch (final ExecutionException ex) {
LOG.fatal("Exception in {} crawler.", getSender().getName(), ex);
//
return new ArteDtoVideo2FilmTask(this, new ConcurrentLinkedQueue<>(videosWithLink));

} catch (final Exception ex) {
LOG.fatal("Exception in {} crawler.", getSender(), ex);
}
return null;
}

protected boolean isDayEntriesEnabled() {
return true;
/**
 * Builds the queue of paged "all videos" list URLs for this crawler's language.
 *
 * <p>One {@link TopicUrlDTO} is created per page, starting at page 1 and running up to and
 * including the configured maximum number of subpages. The page count is capped at 100
 * regardless of the configured value.
 *
 * <p>The upper bound is inclusive so that the behaviour matches the documented meaning of
 * {@code maximumSubpages} ("if the amount is 5 then the crawler crawls pages 1 to 5"). The
 * previous exclusive bound always dropped the last page and produced an empty queue for
 * {@code maximumSubpages: 1}.
 *
 * @return a queue with one entry per page to crawl; empty if the configured maximum is
 *     smaller than 1
 */
private Queue<TopicUrlDTO> createAllVideosQueue() {
  // Never request more than 100 pages, whatever the configuration says.
  final int maxPages = Math.min(getCrawlerConfig().getMaximumSubpages(), 100);
  // Locale.ROOT keeps the language code stable (e.g. "IT" -> "it") on every default locale.
  final String language = getLanguage().toString().toLowerCase(java.util.Locale.ROOT);

  final Queue<TopicUrlDTO> root = new ConcurrentLinkedQueue<>();
  for (int page = 1; page <= maxPages; page++) {
    final String rootUrl = String.format(ArteConstants.VIDEOS_URL, page, language);
    root.add(new TopicUrlDTO("all videos " + page, rootUrl));
  }
  return root;
}

protected ArteLanguage getLanguage() {
return ArteLanguage.DE;
}
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,4 @@ protected ArteLanguage getLanguage() {
return ArteLanguage.EN;
}

@Override
protected boolean isDayEntriesEnabled() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,4 @@ protected ArteLanguage getLanguage() {
return ArteLanguage.ES;
}

@Override
protected boolean isDayEntriesEnabled() {
return false;
}
}
Loading