public final String link;
public final Date publicationDate;
public final String website;
+ public final String[] entities;
public final AtomicLong readCount = new AtomicLong();
- public Article(String link, String title, String description, String thumbnail, Date publicationDate, String website) {
+ public Article(String link, String title, String description, String thumbnail, Date publicationDate, String website, String[] entities) { // every argument is stored unchanged in the same-named field
this.link = link;
this.title = title;
this.description = description;
this.thumbnail = thumbnail;
this.publicationDate = publicationDate;
this.website = website;
+ this.entities = entities; // kept as-is: no defensive copy and no null check (toArticle passes null when no classification ran)
+ }
+
+ public String[] getEntities() { // hands back the stored array itself (not a copy); null if the article was constructed with null entities
+ return entities;
}
}
--- /dev/null
+package pnews;
+
+public class EntityStat { // Pairs an entity string with a counter that callers bump through increment().
+ private final String entity; // entity text, fixed at construction
+ private int count; // implicitly starts at 0; plain int, this class does no synchronization
+ /** Creates a stat for the given entity with a count of zero. */
+ public EntityStat(String entity) {
+ this.entity = entity;
+ }
+ /** Adds one to the count. */
+ public void increment() {
+ count++;
+ }
+ /** Returns how many times {@link #increment()} has been called on this instance. */
+ public int getCount() {
+ return count;
+ }
+ /** Returns the entity text that was passed to the constructor. */
+ public String getEntity() {
+ return entity;
+ }
+ /** Formats this stat as {@code entity(count)}, for example {@code Ubuntu(3)}. */
+ @Override
+ public String toString() {
+ return entity + "(" + count + ")";
+ }
+}
package pnews;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.List;
+import java.util.logging.Logger;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
/** https://stanfordnlp.github.io/CoreNLP/api.html */
public class NER {
- public static void classify(String str) throws ClassCastException, ClassNotFoundException, IOException {
+ private static final String CLASS_NAME = NER.class.getName();
+ private static final Logger LOG = Logger.getLogger(CLASS_NAME);
+
+ public static String[] classify(String str) throws ClassCastException, ClassNotFoundException, IOException { // returns the distinct tokens of str whose label is not "O", in first-seen order
CRFClassifier<CoreLabel> classifier;
List<List<CoreLabel>> out;
String cat, w;
+ List<String> entities;
+ final String FUNCTION_NAME = "classify";
+ // NOTE(review): the default classifier is fetched on every call; presumably this reloads the model each time - confirm and consider caching.
+ LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
classifier = CRFClassifier.getDefaultClassifier();
out = classifier.classify(str);
+ entities = new ArrayList<>();
for (List<CoreLabel> labels: out)
for (CoreLabel l: labels) {
cat = l.getString(AnswerAnnotation.class);
w = l.word();
- System.out.println(cat + " " + w);
+ if (!cat.equals("O") && !entities.contains(w)) // keep each word once, and only when its label differs from "O"
+ entities.add(w);
}
+ // Words are collected one label at a time, so a multi-word name produces one entry per word.
+ LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
+ // Never null: an empty array means no word carried a label other than "O".
+ return entities.toArray(new String[0]);
}
public static void main(String[] args) throws Exception {
import pnews.Article;
import pnews.Category;
+import pnews.EntityStat;
import pnews.Feed;
+import pnews.NER;
public class ArticleProvider {
- private static final Logger LOG = Logger.getLogger(ArticleProvider.class.getName());
+ private static final String CLASS_NAME = ArticleProvider.class.getName();
+ private static final Logger LOG = Logger.getLogger(CLASS_NAME);
private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
private final Config config;
return false;
}
- private static Article toArticle(String link, SyndEntry entry, SyndFeed feed) {
+ private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
String desc, title, thumbnail, feedTitle, str;
Date date;
+ String[] entities;
feedTitle = feed.getTitle();
if (feedTitle != null) {
if (date == null)
LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
- return new Article(link, title, desc, thumbnail, date, feedTitle);
+
+ entities = null;
+ if (desc != null && lang.equals("en"))
+ try {
+ entities = NER.classify(desc);
+ } catch (ClassCastException | ClassNotFoundException | IOException e1) {
+ LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
+ }
+
+ return new Article(link, title, desc, thumbnail, date, feedTitle, entities);
}
private void addArticles(Category cat, SyndFeed feed) {
continue ;
}
- a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed));
+ a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage()));
synchronized (articles) {
articles.add(a);
}
}
+ /**
+  * Tallies the entities attached to the articles of a category.
+  *
+  * Every string found in an article's {@code getEntities()} array adds one to
+  * the matching {@link EntityStat}; articles whose entity array is null are
+  * ignored. The result is ordered by descending count.
+  *
+  * @param cat the category whose articles are inspected
+  * @return a new, mutable list of stats, most frequent first; empty when the
+  *         category has no articles or none of them carries entities
+  * @throws IllegalArgumentException propagated from {@code getArticles(cat)}
+  * @throws MalformedURLException propagated from {@code getArticles(cat)}
+  * @throws FeedException propagated from {@code getArticles(cat)}
+  * @throws IOException propagated from {@code getArticles(cat)}
+  */
+ public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
+     // Must match the method name so entering/exiting log records are attributed correctly.
+     final String FUNCTION_NAME = "getEntityStats";
+     List<Article> articles;
+     Map<String, EntityStat> entities;
+     String[] names;
+     List<EntityStat> stats;
+
+     LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
+
+     articles = getArticles(cat);
+
+     entities = new HashMap<>();
+     // Callers of getArticles already treat null as "nothing available";
+     // report that as "no stats" instead of failing with a NullPointerException.
+     if (articles != null)
+         for (Article a: articles) {
+             names = a.getEntities();
+             if (names == null)
+                 continue;
+             for (String e: names)
+                 entities.computeIfAbsent(e, EntityStat::new).increment();
+         }
+
+     stats = new ArrayList<>(entities.values());
+     // Arguments are swapped on purpose: highest count first.
+     stats.sort((o1, o2) -> Integer.compare(o2.getCount(), o1.getCount()));
+
+     LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
+
+     return stats;
+ }
+
+
private class Refresher implements Runnable {
private final Category category;
package pnews.servlet;
+import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
+import com.rometools.rome.io.FeedException;
+
import pnews.Article;
import pnews.Category;
+import pnews.EntityStat;
import pnews.Language;
public class HTML {
buf.append("</nav>\n");
}
- public static String toHTML(List<Article> articles, Category catActive, Config cfg) {
+ public static String toHTML(List<Article> articles, Category catActive, Config cfg, ArticleProvider provider) {
StringBuffer buf;
int i;
Category[] cats;
+ List<EntityStat> entities;
buf = new StringBuffer();
buf.append("<!DOCTYPE html>\n");
appendMenu(buf, catActive, cfg);
+ try {
+ entities = provider.getEntityStats(catActive);
+
+ if (entities.size() > 0) {
+ buf.append("Hot topics: ");
+ buf.append("<ul>");
+ i = 0;
+ for (EntityStat s: entities) {
+ buf.append("<li>");
+ buf.append(s.getEntity());
+ buf.append("</li>");
+ i++;
+ if (i > 10)
+ break;
+ }
+ buf.append("</ul>");
+ }
+ } catch (IllegalArgumentException | FeedException | IOException e2) {
+ LOG.log(Level.SEVERE, "Failed to get entities", e2);
+ }
+
i = 0;
for (Article e: articles) {
try {
try {
articles = provider.getArticles(cat);
if (articles != null) {
- html = HTML.toHTML(articles, cat, config);
+ html = HTML.toHTML(articles, cat, config, provider);
rp.setContentType("text/html;charset=utf-8");
rp.getWriter().write(html);
rp.setCharacterEncoding("utf-8");
} else {
LOG.severe("writeArticles cannot retrieve any articles");
- html = HTML.toHTML(new ArrayList<Article>(), cat, config);
+ html = HTML.toHTML(new ArrayList<Article>(), cat, config, provider);
rp.setContentType("text/html");
rp.getWriter().write(html);
}
"title": "People",
"language": "fr"
}, {
- "id": "ubuntu",
- "label": "Ubuntu",
- "title": "Ubuntu",
+ "id": "en_technologie",
+ "label": "Technologie",
+ "title": "Technologie",
"language": "en"
}
],
"http://www.ville-palaiseau.fr/rss/actualites.htm": { "categories": ["essonne"] },
"http://www.premiere.fr/rss/actu-live": { "categories": ["people"] },
"http://www.purepeople.com/rss/news_t0.xml": { "categories": ["people"] },
+ "http://www.01net.com/rss/info/flux-rss/flux-toutes-les-actualites/": { "categories": ["technologie"] },
"http://www.generation-nt.com/export/rss.xml": { "categories": ["technologie"] },
"http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] },
"http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]},
"http://www.futura-sciences.com/rss/actualites.xml": { "categories": ["technologie"] },
"https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] },
"https://korben.info/feed": { "categories": ["technologie"]},
- "https://insights.ubuntu.com/feed/": { "categories": ["ubuntu"]}
+ "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]},
+ "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]},
+ "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]},
+ "https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]},
+ "https://www.technologyreview.com/c/computing/rss/": { "categories": ["en_technologie"]},
+ "https://www.techworld.com/news/rss": { "categories": ["en_technologie"]},
+ "http://feeds.feedburner.com/TechCrunch/": { "categories": ["en_technologie"]},
+ "http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]},
+ "https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]},
+ "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]},
+ "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}
}
}