+package net.wpitchoune.pnews.servlet;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.jsoup.Jsoup;
+
+import com.rometools.rome.feed.synd.SyndEnclosure;
+import com.rometools.rome.feed.synd.SyndEntry;
+import com.rometools.rome.feed.synd.SyndFeed;
+import com.rometools.rome.io.FeedException;
+import com.rometools.rome.io.SyndFeedInput;
+import com.rometools.rome.io.XmlReader;
+
+import net.wpitchoune.pnews.Article;
+import net.wpitchoune.pnews.ArticleStore;
+import net.wpitchoune.pnews.Category;
+import net.wpitchoune.pnews.Config;
+import net.wpitchoune.pnews.EntityStat;
+import net.wpitchoune.pnews.Feed;
+import net.wpitchoune.pnews.classifier.NamedEntityRecognizer;
+
+/**
+ * Keeps, for every {@link Category}, an in-memory list of {@link Article}s
+ * built from the feeds configured for that category.
+ *
+ * <p>At construction one refresh task per category returned by
+ * {@link Config#getCategories()} is scheduled on an internal thread pool:
+ * the first run happens after {@value #INITIAL_DELAY_SECONDS} seconds and
+ * the following ones every {@value #REFRESH_PERIOD_SECONDS} seconds.
+ *
+ * <p>Locking: the category map is guarded by synchronizing on the map, and
+ * each per-category list is guarded by synchronizing on the list itself.
+ *
+ * <p>NOTE(review): nothing in this class shuts the scheduler down; its
+ * threads live as long as the JVM unless a caller-visible shutdown hook is
+ * added.
+ */
+public class ArticleProvider {
+    private static final String CLASS_NAME = ArticleProvider.class.getName();
+    private static final Logger LOG = Logger.getLogger(CLASS_NAME);
+
+    /** Delay before the first refresh of a category, in seconds. */
+    private static final long INITIAL_DELAY_SECONDS = 2;
+
+    /** Period between two refreshes of the same category, in seconds. */
+    private static final long REFRESH_PERIOD_SECONDS = 600;
+
+    /** Only articles published within this many days count in entity statistics. */
+    private static final long ENTITY_STATS_WINDOW_DAYS = 15;
+
+    /** Orders articles from newest to oldest; articles without a date go last. */
+    private static final Comparator<Article> NEWEST_FIRST =
+        Comparator.comparing(Article::getPublicationDate,
+                             Comparator.nullsLast(Comparator.reverseOrder()));
+
+    private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
+    private final ScheduledExecutorService scheduler =
+        Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
+    private final Config config;
+
+    /**
+     * Creates the provider and schedules the periodic refresh of every
+     * configured category.
+     *
+     * @param config the application configuration (categories, feeds, obsolescence rule)
+     */
+    public ArticleProvider(Config config) {
+        this.config = config;
+        for (Category cat : config.getCategories()) {
+            scheduler.scheduleAtFixedRate(new Refresher(cat),
+                                          INITIAL_DELAY_SECONDS,
+                                          REFRESH_PERIOD_SECONDS,
+                                          TimeUnit.SECONDS);
+        }
+    }
+
+    /**
+     * Downloads and parses the feed located at the given URL.
+     *
+     * @param u the feed URL
+     * @return the parsed feed
+     * @throws MalformedURLException if {@code u} is not a valid URL
+     * @throws IOException if the feed cannot be read
+     * @throws FeedException if the feed cannot be parsed
+     */
+    private static SyndFeed getSyndFeed(String u)
+            throws IllegalArgumentException, FeedException, MalformedURLException, IOException {
+        // The reader wraps the network stream: close it once the feed is built.
+        try (XmlReader reader = new XmlReader(new URL(u))) {
+            return new SyndFeedInput().build(reader);
+        }
+    }
+
+    /**
+     * Returns the live (not copied) article list of a category, creating and
+     * registering an empty one on first access. Callers must synchronize on
+     * the returned list before reading or modifying it.
+     */
+    private List<Article> getArticlesForUpdate(Category cat) {
+        synchronized (articlesByCategory) {
+            return articlesByCategory.computeIfAbsent(cat, key -> new ArrayList<>());
+        }
+    }
+
+    /** Tells whether an article with the given link is already in the list. */
+    private boolean exists(String articleLink, List<Article> articles) {
+        synchronized (articles) {
+            for (Article a : articles) {
+                if (a.getLink().equals(articleLink)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Returns the instant of an entry: its updated date if present, else its
+     * published date, else the current instant.
+     */
+    private Instant getArticleInstant(SyndEntry entry) {
+        Date date = entry.getUpdatedDate();
+        if (date == null) {
+            date = entry.getPublishedDate();
+        }
+        return date == null ? Instant.now() : date.toInstant();
+    }
+
+    /**
+     * Returns the URL of the first enclosure whose type starts with
+     * {@code image/}, or {@code null} when the entry has none. Enclosures
+     * without a type are ignored.
+     */
+    private static String findThumbnail(SyndEntry entry) {
+        for (SyndEnclosure enclosure : entry.getEnclosures()) {
+            String type = enclosure.getType();
+            if (type != null && type.startsWith("image/")) {
+                return enclosure.getUrl();
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Builds an {@link Article} from a feed entry.
+     *
+     * <p>The description is converted to plain text with Jsoup. Named
+     * entities are extracted from the title and the description only when
+     * {@code lang} is {@code "en"}; a classification failure is logged and
+     * leaves the entities collected so far.
+     *
+     * @param link the (already trimmed) link of the entry
+     * @param entry the feed entry; its title must not be {@code null}
+     * @param feed the feed the entry belongs to
+     * @param lang the language of the category, may be {@code null}
+     * @param instant the instant to store as publication date
+     * @return the new article
+     */
+    private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang, Instant instant) {
+        String feedTitle = feed.getTitle();
+        if (feedTitle != null) {
+            feedTitle = feedTitle.trim();
+        }
+
+        String thumbnail = findThumbnail(entry);
+        String title = entry.getTitle().trim();
+
+        // A description element without a value is handled like a missing one.
+        String rawDescription = entry.getDescription() == null ? null : entry.getDescription().getValue();
+        String desc;
+        if (rawDescription != null) {
+            desc = Jsoup.parse(rawDescription).text();
+        } else {
+            desc = null;
+            LOG.severe("No description for " + feedTitle + " - " + title);
+        }
+
+        List<String> entities = new ArrayList<>();
+        if ("en".equals(lang)) {
+            try {
+                NamedEntityRecognizer.classify(title, entities, config);
+                if (desc != null) {
+                    NamedEntityRecognizer.classify(desc, entities, config);
+                }
+            } catch (ClassCastException | ClassNotFoundException | IOException e) {
+                LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e);
+            }
+        }
+
+        return new Article(link, title, desc, thumbnail, instant, feedTitle, entities.toArray(new String[0]));
+    }
+
+    /**
+     * Adds to the category list every entry of the feed that is not already
+     * present (same link) and not obsolete according to the configuration.
+     * Entries without a link or a title are skipped. The list is re-sorted
+     * newest first after each insertion, under the list lock.
+     */
+    private void addArticles(Category cat, SyndFeed feed) {
+        // Only used for logging here; a feed without title is still processed.
+        String feedTitle = feed.getTitle();
+        if (feedTitle != null) {
+            feedTitle = feedTitle.trim();
+        }
+
+        LOG.info("addArticles " + cat.getLabel() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
+
+        // Entries are never removed from the map, so the list can be fetched once.
+        List<Article> articles = getArticlesForUpdate(cat);
+
+        for (SyndEntry entry : feed.getEntries()) {
+            if (entry.getLink() == null || entry.getTitle() == null) {
+                LOG.warning("addArticles skipping entry without link or title in " + feedTitle);
+                continue;
+            }
+
+            String link = entry.getLink().trim();
+            if (exists(link, articles)) {
+                LOG.fine("addArticles " + link + " is already present");
+                continue;
+            }
+
+            Instant instant = getArticleInstant(entry);
+            if (config.isObsolete(instant)) {
+                continue;
+            }
+
+            // Built outside of the list lock: creating the article may be slow.
+            Article article = ArticleStore.singleton.getArticle(
+                link, () -> toArticle(link, entry, feed, cat.getLanguage(), instant));
+
+            synchronized (articles) {
+                articles.add(article);
+                Collections.sort(articles, NEWEST_FIRST);
+            }
+        }
+
+        LOG.info("addArticles done " + cat.getLabel());
+    }
+
+    /**
+     * Downloads every feed configured for the category and adds its
+     * articles. A failure on one feed is logged and does not prevent the
+     * other feeds from being processed.
+     */
+    private void retrieveArticles(Category cat) {
+        List<Feed> feeds = config.getFeedsByCategory().get(cat);
+
+        if (feeds == null) {
+            LOG.severe("No feed for category " + cat);
+            return;
+        }
+
+        for (Feed f : feeds) {
+            try {
+                addArticles(cat, getSyndFeed(f.getURL()));
+            } catch (Throwable e) {
+                // Deliberately broad: one broken feed must not stop the others.
+                LOG.log(Level.SEVERE,
+                        "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
+                        e);
+            }
+        }
+    }
+
+    /**
+     * Returns a copy of the articles of a category, in the order of the
+     * internal list (newest first).
+     *
+     * @param cat the category
+     * @param entity when not {@code null}, only the articles for which
+     *        {@code hasEntity(entity)} is true are returned
+     * @return a new list, never the internal one
+     */
+    public List<Article> getArticles(Category cat, String entity)
+            throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
+        // getArticlesForUpdate takes the map lock itself.
+        List<Article> articles = getArticlesForUpdate(cat);
+
+        synchronized (articles) {
+            if (entity == null) {
+                return new ArrayList<>(articles);
+            }
+
+            List<Article> result = new ArrayList<>(articles.size());
+            for (Article a : articles) {
+                if (a.hasEntity(entity)) {
+                    result.add(a);
+                }
+            }
+            return result;
+        }
+    }
+
+    /**
+     * Counts, for a category, how many times each entity appears in the
+     * articles published during the last {@value #ENTITY_STATS_WINDOW_DAYS}
+     * days. Articles without publication date or without entities are
+     * ignored.
+     *
+     * @param cat the category
+     * @return the statistics sorted by decreasing count
+     */
+    public List<EntityStat> getEntityStats(Category cat)
+            throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
+        final String FUNCTION_NAME = "getEntityStats";
+
+        LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
+
+        Instant minInstant = Instant.now().minus(ENTITY_STATS_WINDOW_DAYS, ChronoUnit.DAYS);
+
+        Map<String, EntityStat> statsByEntity = new HashMap<>();
+        for (Article a : getArticles(cat, null)) {
+            if (a.getPublicationDate() == null
+                    || !a.getPublicationDate().isAfter(minInstant)
+                    || a.getEntities() == null) {
+                continue;
+            }
+            for (String e : a.getEntities()) {
+                statsByEntity.computeIfAbsent(e, name -> new EntityStat(name)).increment();
+            }
+        }
+
+        List<EntityStat> stats = new ArrayList<>(statsByEntity.values());
+        stats.sort((left, right) -> Integer.compare(right.getCount(), left.getCount()));
+
+        LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
+
+        return stats;
+    }
+
+    /** Periodic task refreshing the articles of one category. */
+    private class Refresher implements Runnable {
+        private final Category category;
+
+        public Refresher(Category category) {
+            this.category = category;
+        }
+
+        @Override
+        public void run() {
+            LOG.info("refresher "+ category.getLabel());
+
+            try {
+                retrieveArticles(category);
+            } catch (RuntimeException e) {
+                // An exception escaping run() would suppress every later
+                // execution of this periodic task.
+                LOG.log(Level.SEVERE, "refresher failure", e);
+            }
+
+            LOG.info("refresher "+ category.getLabel() + " done");
+        }
+    }
+}