Skip to content
Permalink
Browse files

Use ArticleItem object during clustering as well as display; add sort by date function and use that to order each cluster.
  • Loading branch information...
arthurpsmith committed Nov 29, 2018
1 parent 71fb84e commit 8d0e563cbd2cde49edf2a6dff9de27a8a919348f
Showing with 49 additions and 40 deletions.
  1. +14 −17 index.php
  2. +9 −0 lib/article_model.php
  3. +25 −22 lib/clustering.php
  4. +1 −1 resources/html/index_bs4.html
@@ -68,7 +68,6 @@ function getORCIDurl ( $s ) {
foreach ( $items_authors AS $q ) $to_load[] = $q ;
$wil->loadItems ( $to_load ) ;
$delete_statements = array() ;
if ( $action == 'add' ) {
print "<form method='post' class='form' action='https://tools.wmflabs.org/quickstatements/api.php'>" ;
@@ -111,26 +110,28 @@ function getORCIDurl ( $s ) {
<input type='hidden' name='fuzzy' value='$fuzzy' />
<input type='hidden' name='name' value='" . escape_attribute($name) . "' />" ;
$clusters = cluster_articles ( $wil, $items_papers ) ;
#print "<pre>" ; print_r ( $clusters ) ; print "</pre>" ;
// P50 authors
$to_load = array() ;
$article_items = array();
foreach ( $items_papers AS $q ) {
$i = $wil->getItem ( $q ) ;
if ( !isset($i) ) continue ;
$claims = $i->getClaims ( 'P50' ) ;
foreach ( $claims AS $c ) $to_load[] = $i->getTarget ( $c ) ;
$claims = $i->getClaims ( 'P1433' ) ;
foreach ( $claims AS $c ) $to_load[] = $i->getTarget ( $c ) ;
$article = new WikidataArticleEntry( $i ) ;
$article_items[] = $article ;
foreach ( $article->authors AS $auth ) $to_load[] = $auth ;
foreach ( $article->published_in AS $pub ) $to_load[] = $pub ;
foreach ( $article->topics AS $topic ) $to_load[] = $topic ;
}
$wil->loadItems ( $to_load ) ;
$clusters = cluster_articles ( $article_items ) ;
#print "<pre>" ; print_r ( $clusters ) ; print "</pre>" ;
// Publications
$name_counter = array() ;
print "<h2>Potential publications</h2>" ;
print "<p>" . count($items_papers) . " publications found</p>" ;
print "<p>" . count($article_items) . " publications found</p>" ;
$is_first_group = true ;
foreach ( $clusters AS $cluster_name => $cluster ) {
@@ -148,11 +149,8 @@ function getORCIDurl ( $s ) {
print "<th>Author Name Strings</th><th>Identified Authors</th>" ;
print "<th>Published In</th><th>Identifier(s)</th>" ;
print "<th>Topic</th><th>Published Date</th></tr>" ;
foreach ( $cluster AS $q ) {
$q = "Q$q" ;
$i = $wil->getItem ( $q ) ;
if ( !isset($i) ) continue ;
$article = new WikidataArticleEntry( $i ) ;
foreach ( $cluster AS $article ) {
$q = $article->q ;
$out = array() ;
foreach ( $article->author_names AS $a ) {
@@ -198,7 +196,6 @@ function getORCIDurl ( $s ) {
if ( count($article->topics) > 0 ) {
$topics = [] ;
foreach ( $article->topics AS $qt ) {
$wil->loadItem ( $qt ) ;
$i2 = $wil->getItem($qt) ;
if ( !isset($i2) ) continue ;
$topics[] = "<a href='https://www.wikidata.org/wiki/" . $i2->getQ() . "' target='_blank' style='color:green'>" . $i2->getLabel() . "</a>" ;
@@ -75,5 +75,14 @@ public function formattedPublicationDate () {
if ( $this->publication_date != '' ) $formatted_date = DateTime::createFromFormat( '\+Y-m-d\TH:i:s\Z', $this->publication_date )->format( "Y-m-d" );
return $formatted_date ;
}
/**
 * Comparison callback that orders two entries by publication date,
 * newest first (suitable for usort()).
 *
 * The raw publication_date values are compared with PHP's == and >
 * operators; no date parsing happens here.
 *
 * @param object $a First entry; its publication_date property is read.
 * @param object $b Second entry; its publication_date property is read.
 * @return int 0 when the two dates compare equal, -1 when $a's date is
 *             greater than $b's (so $a sorts first), 1 otherwise.
 */
public static function dateCompare ($a, $b) {
	$first = $a->publication_date ;
	$second = $b->publication_date ;

	if ( $first == $second ) return 0 ;

	// A greater (later) date sorts ahead of a lesser one: descending order.
	if ( $first > $second ) return -1 ;
	return 1 ;
}
}
?>
@@ -3,21 +3,17 @@
// Heuristic clustering algorithm
$min_authors_for_cluster = 4 ;
$score_cache = array() ;
function compareArticles( $wil, $q1 , $q2 ) {
function compareArticles( $article1 , $article2 ) {
global $score_cache , $min_authors_for_cluster ;
if ( $q1 > $q2 ) return compareArticles ( $wil, $q2 , $q1 ) ; // Enforce $q1 <= $q2
$q1 = $article1->q ;
$q2 = $article2->q ;
if ( $q1 > $q2 ) return compareArticles ( $article2 , $article1 ) ; // Enforce $q1 <= $q2
$key = "$q1|$q2" ;
if ( isset($score_cache[$key]) ) return $score_cache[$key] ;
$i1 = $wil->getItem ( $q1 ) ;
$i2 = $wil->getItem ( $q2 ) ;
if ( !isset($i1) or !isset($i2) ) return 0 ;
$authors1 = $i1->getStrings ( 'P2093' ) ;
$authors2 = $i2->getStrings ( 'P2093' ) ;
foreach ( $i1->getClaims('P50') AS $claim ) $authors1[] = $i1->getTarget ( $claim ) ;
foreach ( $i2->getClaims('P50') AS $claim ) $authors2[] = $i2->getTarget ( $claim ) ;
$authors1 = array_merge( $article1->author_names, $article1->authors ) ;
$authors2 = array_merge( $article2->author_names, $article2->authors ) ;
$score = 0 ;
if ( count($authors1) < $min_authors_for_cluster or count($authors2) < $min_authors_for_cluster ) {
@@ -39,36 +35,43 @@ function compareArticles( $wil, $q1 , $q2 ) {
return $score ;
}
function cluster_articles ( $wil, $items_papers ) {
function cluster_articles ( $article_items ) {
$clusters = array() ;
$min_score = 30 ;
$is_in_cluster = array() ;
foreach ( $items_papers AS $q1 ) {
foreach ( $article_items AS $article ) {
$q1 = $article->q ;
if ( isset($is_in_cluster[$q1]) ) continue ;
$base_score = compareArticles ( $wil, $q1 , $q1 ) ;
$base_score = compareArticles ( $article , $article ) ;
if ( $base_score == 0 ) continue ;
$cluster = array() ;
foreach ( $items_papers AS $q2 ) {
foreach ( $article_items AS $article2 ) {
$q2 = $article2->q ;
if ( $q1 == $q2 ) continue ;
if ( isset($is_in_cluster[$q2]) ) continue ;
$score = compareArticles ( $wil, $q1 , $q2 ) ;
$score = compareArticles ( $article , $article2 ) ;
$score = 100 * $score / $base_score ;
if ( $score >= $min_score ) {
if ( count($cluster) == 0 ) $cluster[] = $q1 ;
$cluster[] = $q2 ;
if ( count($cluster) == 0 ) $cluster[] = $article ;
$cluster[] = $article2 ;
}
}
if ( count($cluster) == 0 ) continue ;
foreach ( $cluster AS $q ) $is_in_cluster[$q] = $q ;
usort( $cluster, 'WikidataArticleEntry::dateCompare' );
foreach ( $cluster AS $c ) $is_in_cluster[$c->q] = 1 ;
$clusters['Group #'.(count($clusters)+1)] = $cluster ;
}
$cluster = array() ;
foreach ( $items_papers AS $q1 ) {
if ( isset($is_in_cluster[$q1]) ) continue ;
$cluster[] = $q1 ;
foreach ( $article_items AS $article ) {
if ( isset($is_in_cluster[$article->q]) ) continue ;
$cluster[] = $article ;
}
if ( count($cluster) > 0 ) $clusters['Misc'] = $cluster ;
if ( count($cluster) > 0 ) {
usort( $cluster, 'WikidataArticleEntry::dateCompare' );
$clusters['Misc'] = $cluster ;
}
return $clusters;
}
@@ -52,4 +52,4 @@
</div>
</div>
</body>
</html>
</html>

0 comments on commit 8d0e563

Please sign in to comment.
You can’t perform that action at this time.