3 import java.io.IOException;
4 import java.net.MalformedURLException;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.List;
13 import java.util.concurrent.Executors;
14 import java.util.concurrent.ScheduledExecutorService;
15 import java.util.concurrent.TimeUnit;
16 import java.util.logging.Level;
17 import java.util.logging.Logger;
19 import org.jsoup.Jsoup;
21 import com.rometools.rome.feed.synd.SyndEnclosure;
22 import com.rometools.rome.feed.synd.SyndEntry;
23 import com.rometools.rome.feed.synd.SyndFeed;
24 import com.rometools.rome.io.FeedException;
25 import com.rometools.rome.io.SyndFeedInput;
26 import com.rometools.rome.io.XmlReader;
29 import pnews.Category;
30 import pnews.EntityStat;
/**
 * Keeps an in-memory list of articles for each {@link Category} and keeps
 * those lists up to date through periodically scheduled {@code Refresher}
 * tasks.
 *
 * The category map and each per-category list are read and modified inside
 * {@code synchronized} blocks on the respective object.
 */
34 public class ArticleProvider {
// Also used as the source class name in LOG.entering / LOG.exiting records.
35 private static final String CLASS_NAME = ArticleProvider.class.getName();
36 private static final Logger LOG = Logger.getLogger(CLASS_NAME);
// Article list of each category; accessed under synchronized (articlesByCategory).
37 private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
// Pool of two threads on which the periodic refresh tasks run.
// NOTE(review): no shutdown of this executor is visible in this class - confirm
// whether its threads are meant to live as long as the JVM.
38 private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
// Supplies the categories and the feeds registered for each category.
39 private final Config config;
/**
 * Schedules one periodic {@code Refresher} for every category returned by
 * {@code config.getCategories()}.
 *
 * @param config configuration supplying the categories to refresh
 */
41 public ArticleProvider(Config config) {
43 for (Category cat: config.getCategories())
// Fixed rate: first run after 2 seconds, then one run every 600 seconds.
44 scheduler.scheduleAtFixedRate(new Refresher(cat), 2, 600, TimeUnit.SECONDS);
47 private static SyndFeed getSyndFeed(String u) throws IllegalArgumentException, FeedException, MalformedURLException, IOException {
50 r = new XmlReader(new URL(u));
52 return new SyndFeedInput().build(r);
55 private List<Article> getArticlesForUpdate(Category cat) {
58 synchronized (articlesByCategory) {
59 result = articlesByCategory.get(cat);
61 result = new ArrayList<>();
62 articlesByCategory.put(cat, result);
68 private boolean exists(String articleLink, List<Article> articles) {
69 synchronized (articles) {
70 for (Article a: articles)
71 if (a.link.equals(articleLink))
/**
 * Builds an {@code Article} from one feed entry.
 *
 * Reads the feed title, a thumbnail URL, the entry title, the description
 * reduced to plain text, a date and, for English content, the entities
 * returned by {@code NER.classify}.
 *
 * @param link  link of the article, passed through to the {@code Article}
 * @param entry feed entry to convert
 * @param feed  feed the entry belongs to; supplies the feed title and the
 *              fallback image
 * @param lang  language code; entity extraction is attempted only for "en"
 * @return the new article
 */
77 private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
78 String desc, title, thumbnail, feedTitle, str;
// The feed title may be null; it is trimmed only when present.
82 feedTitle = feed.getTitle();
83 if (feedTitle != null) {
84 feedTitle = feedTitle.trim();
// Thumbnail: use the URL of an enclosure whose MIME type starts with "image/".
// NOTE(review): e.getType() is dereferenced without a null check - presumably an
// enclosure with no type attribute would throw NullPointerException; confirm.
88 for (SyndEnclosure e: entry.getEnclosures()) {
89 if (e.getType().startsWith("image/"))
90 thumbnail = e.getUrl();
// Fall back to the feed-level image when no thumbnail has been found.
94 if (thumbnail == null && feed.getImage() != null)
95 thumbnail = feed.getImage().getUrl();
// NOTE(review): unlike the feed title above, the entry title is trimmed without
// a null check.
98 title = entry.getTitle().trim();
100 if (entry.getDescription() != null) {
101 str = entry.getDescription().getValue();
// Parse the description as HTML and keep only its text content.
102 desc = Jsoup.parse(str).text();
105 LOG.severe("No description for " + feedTitle + " - " + title);
// Both the published date and the updated date are consulted - presumably the
// latter only as a fallback; confirm.
108 date = entry.getPublishedDate();
110 date = entry.getUpdatedDate();
112 LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
// Entity extraction needs a description and English content.
116 if (desc != null && lang.equals("en"))
118 entities = NER.classify(desc);
// A classification failure is logged here.
119 } catch (ClassCastException | ClassNotFoundException | IOException e1) {
120 LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
123 return new Article(link, title, desc, thumbnail, date, feedTitle, entities);
/**
 * Merges the entries of a feed into the article list of a category.
 *
 * Each entry's link is checked against the list; for other links an article is
 * obtained from {@code ArticleStore.singleton}, with {@code toArticle} as the
 * supplier. The list is ordered by publication date, most recent first.
 *
 * @param cat  category whose list receives the articles
 * @param feed parsed feed providing the entries
 */
126 private void addArticles(Category cat, SyndFeed feed) {
128 List<Article> articles;
// NOTE(review): feed.getTitle() is trimmed without a null check, whereas
// toArticle treats a null feed title as possible - a feed with no title would
// throw NullPointerException here.
131 feedTitle = feed.getTitle().trim();
133 LOG.info("addArticles " + cat.getLabel() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
135 for (SyndEntry entry: feed.getEntries()) {
// NOTE(review): entry.getLink() is likewise dereferenced without a null check.
136 String link = entry.getLink().trim();
// The lookup depends only on cat, so it returns the same list on every
// iteration and could be hoisted out of the loop.
137 articles = getArticlesForUpdate(cat);
138 if (exists(link, articles)) {
139 LOG.fine("addArticles " + link + " is already present");
// The supplier builds the article from the entry; presumably ArticleStore calls
// it only when it holds no article for this link - confirm against ArticleStore.
143 a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage()));
145 synchronized (articles) {
// Order by publication date; two non-null dates compare newest first.
148 Collections.sort(articles, new Comparator<Article>() {
150 public int compare(Article o1, Article o2) {
// Reference comparison: true when both dates are null or the same object.
151 if (o1.publicationDate == o2.publicationDate)
153 if (o1.publicationDate == null)
155 if (o2.publicationDate == null)
// o2 compared to o1 gives descending order.
157 return o2.publicationDate.compareTo(o1.publicationDate);
163 LOG.info("addArticles done " + cat.getLabel());
/**
 * Fetches the feeds configured for a category and merges their entries into
 * the category's article list through {@code addArticles}.
 *
 * @param cat category to refresh
 */
166 private void retrieveArticles(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
// Feeds registered for this category in the configuration.
169 feeds = config.getFeedsByCategory().get(cat);
// Download and parse the feed, then merge its entries.
174 addArticles(cat, getSyndFeed(f.getURL()));
// Any failure on this feed is logged together with the category and the feed.
// NOTE(review): Throwable also covers Errors such as OutOfMemoryError - confirm
// that catching those is intended.
175 } catch (Throwable e) {
176 LOG.log(Level.SEVERE,
177 "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
181 LOG.severe("No feed for category " + cat);
187 public List<Article> getArticles(Category cat)
188 throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
189 List<Article> articles;
191 synchronized (articlesByCategory) {
192 articles = getArticlesForUpdate(cat);
195 synchronized (articles) {
196 return new ArrayList<>(articles);
/**
 * Gathers one {@code EntityStat} per entity name found in the articles of a
 * category and orders the statistics by count, highest first.
 *
 * @param cat category whose articles are analysed
 * @return the per-entity statistics, highest count first
 */
200 public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
201 List<Article> articles;
// Statistics keyed by entity name.
202 Map<String, EntityStat> entities;
// NOTE(review): the trace name is "getEntities" while the method is named
// getEntityStats, so entering/exiting log records carry the former.
203 final String FUNCTION_NAME = "getEntities";
205 List<EntityStat> stats;
207 LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
// getArticles returns a copy, so the loop below runs on a private snapshot.
209 articles = getArticles(cat);
211 entities = new HashMap<>();
212 for (Article a: articles)
// Articles without entities are skipped.
213 if (a.getEntities() != null)
214 for (String e: a.getEntities()) {
217 s = new EntityStat(e);
223 stats = new ArrayList<>(entities.values());
224 stats.sort(new Comparator<EntityStat>() {
227 public int compare(EntityStat o1, EntityStat o2) {
// o2 compared to o1 gives descending order of count.
228 return Integer.compare(o2.getCount(), o1.getCount());
233 LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
/**
 * Task that refreshes the articles of one category by calling
 * {@code retrieveArticles}; instances are handed to the scheduler by the
 * enclosing class.
 */
238 private class Refresher implements Runnable {
// Category refreshed by this task.
239 private final Category category;
241 public Refresher(Category category) {
242 this.category = category;
247 LOG.info("refresher "+ category.getLabel());
250 retrieveArticles(category);
// NOTE(review): only these exception types are handled. With
// ScheduledExecutorService.scheduleAtFixedRate, an execution that throws
// suppresses every later execution, so any other unchecked exception escaping
// from here would silently stop the refresh of this category - consider a
// broader catch.
251 } catch (IllegalArgumentException | FeedException | IOException e) {
252 LOG.log(Level.SEVERE, "refresher failure", e);
255 LOG.info("refresher "+ category.getLabel() + " done");