|
@@ -3,21 +3,17 @@ |
|
|
// Heuristic clustering algorithm |
|
|
$min_authors_for_cluster = 4 ; |
|
|
$score_cache = array() ; |
|
|
function compareArticles( $wil, $q1 , $q2 ) { |
|
|
function compareArticles( $article1 , $article2 ) { |
|
|
global $score_cache , $min_authors_for_cluster ; |
|
|
|
|
|
if ( $q1 > $q2 ) return compareArticles ( $wil, $q2 , $q1 ) ; // Enforce $q1 <= $q2 |
|
|
$q1 = $article1->q ; |
|
|
$q2 = $article2->q ; |
|
|
if ( $q1 > $q2 ) return compareArticles ( $article2 , $article1 ) ; // Enforce $q1 <= $q2 |
|
|
$key = "$q1|$q2" ; |
|
|
if ( isset($score_cache[$key]) ) return $score_cache[$key] ; |
|
|
|
|
|
$i1 = $wil->getItem ( $q1 ) ; |
|
|
$i2 = $wil->getItem ( $q2 ) ; |
|
|
if ( !isset($i1) or !isset($i2) ) return 0 ; |
|
|
$authors1 = $i1->getStrings ( 'P2093' ) ; |
|
|
$authors2 = $i2->getStrings ( 'P2093' ) ; |
|
|
|
|
|
foreach ( $i1->getClaims('P50') AS $claim ) $authors1[] = $i1->getTarget ( $claim ) ; |
|
|
foreach ( $i2->getClaims('P50') AS $claim ) $authors2[] = $i2->getTarget ( $claim ) ; |
|
|
$authors1 = array_merge( $article1->author_names, $article1->authors ) ; |
|
|
$authors2 = array_merge( $article2->author_names, $article2->authors ) ; |
|
|
|
|
|
$score = 0 ; |
|
|
if ( count($authors1) < $min_authors_for_cluster or count($authors2) < $min_authors_for_cluster ) { |
|
@@ -39,36 +35,43 @@ function compareArticles( $wil, $q1 , $q2 ) { |
|
|
return $score ; |
|
|
} |
|
|
|
|
|
function cluster_articles ( $wil, $items_papers ) { |
|
|
function cluster_articles ( $article_items ) { |
|
|
$clusters = array() ; |
|
|
$min_score = 30 ; |
|
|
$is_in_cluster = array() ; |
|
|
foreach ( $items_papers AS $q1 ) { |
|
|
foreach ( $article_items AS $article ) { |
|
|
$q1 = $article->q ; |
|
|
if ( isset($is_in_cluster[$q1]) ) continue ; |
|
|
$base_score = compareArticles ( $wil, $q1 , $q1 ) ; |
|
|
$base_score = compareArticles ( $article , $article ) ; |
|
|
if ( $base_score == 0 ) continue ; |
|
|
$cluster = array() ; |
|
|
foreach ( $items_papers AS $q2 ) { |
|
|
foreach ( $article_items AS $article2 ) { |
|
|
$q2 = $article2->q ; |
|
|
if ( $q1 == $q2 ) continue ; |
|
|
if ( isset($is_in_cluster[$q2]) ) continue ; |
|
|
$score = compareArticles ( $wil, $q1 , $q2 ) ; |
|
|
$score = compareArticles ( $article , $article2 ) ; |
|
|
$score = 100 * $score / $base_score ; |
|
|
if ( $score >= $min_score ) { |
|
|
if ( count($cluster) == 0 ) $cluster[] = $q1 ; |
|
|
$cluster[] = $q2 ; |
|
|
if ( count($cluster) == 0 ) $cluster[] = $article ; |
|
|
$cluster[] = $article2 ; |
|
|
} |
|
|
} |
|
|
|
|
|
if ( count($cluster) == 0 ) continue ; |
|
|
foreach ( $cluster AS $q ) $is_in_cluster[$q] = $q ; |
|
|
usort( $cluster, 'WikidataArticleEntry::dateCompare' ); |
|
|
foreach ( $cluster AS $c ) $is_in_cluster[$c->q] = 1 ; |
|
|
$clusters['Group #'.(count($clusters)+1)] = $cluster ; |
|
|
} |
|
|
$cluster = array() ; |
|
|
foreach ( $items_papers AS $q1 ) { |
|
|
if ( isset($is_in_cluster[$q1]) ) continue ; |
|
|
$cluster[] = $q1 ; |
|
|
foreach ( $article_items AS $article ) { |
|
|
if ( isset($is_in_cluster[$article->q]) ) continue ; |
|
|
$cluster[] = $article ; |
|
|
} |
|
|
if ( count($cluster) > 0 ) $clusters['Misc'] = $cluster ; |
|
|
if ( count($cluster) > 0 ) { |
|
|
usort( $cluster, 'WikidataArticleEntry::dateCompare' ); |
|
|
$clusters['Misc'] = $cluster ; |
|
|
} |
|
|
|
|
|
return $clusters; |
|
|
} |
|
|
|
|
|
0 comments on commit
8d0e563