Join GitHub today
GitHub is home to over 36 million developers working together to host and review code, manage projects, and build software together.
Sign up
Query idea: 3 longest words in title of works with no main subject #1118
Comments
This comment has been minimized.
This comment has been minimized.
also posted at |
This comment has been minimized.
This comment has been minimized.
Also from StackOverflow is a Jupyter notebook that does the per-string part (in Python): https://paws-public.wmflabs.org/paws-public/User:Luitzen/Stack/Split%20titles.ipynb?kernel_name=python3 (archived) |
Daniel-Mietchen
added
the
questions-queries
label
Feb 21, 2019
This comment has been minimized.
This comment has been minimized.
Based on this query, here is an adaptation for papers by a given author:
# Long words (6+ word characters) found in the titles (wdt:P1476) of works by
# one author (wdt:P50 wd:Q46155812) that have no main-subject (wdt:P921)
# statement. For every word the result gives one sample work (?item), the
# number of distinct works whose title contains it (?c) and its length (?l).
# Longest words come first; ties are broken by the number of works.
SELECT (SAMPLE(DISTINCT ?x) AS ?item) ?w (COUNT(DISTINCT ?x) AS ?c) (STRLEN(?w) AS ?l) WHERE {
  {
    # Candidate works with their titles, capped at 10000 rows.
    SELECT DISTINCT ?x ?title WHERE {
      # ?date_modified is not used any further; the triple is kept so that the
      # set of matched works stays the same.
      ?x schema:dateModified ?date_modified ;
         # wdt:P921 wd:Q202864 ;
         wdt:P50 wd:Q46155812 ;
         wdt:P1476 ?title.
      # A title shorter than 6 characters cannot contain a 6-character word.
      FILTER(STRLEN(?title) >= 6)
    }
    LIMIT 10000
  }
  # Keep only works that have no main-subject statement at all.
  FILTER NOT EXISTS { ?x wdt:P921 ?topic .}
  BIND(LCASE(?title) AS ?ltitle)
  # ?w1 is the first long word of the title; each following ?wN is the first
  # long word in the part of the title after the previous word.
  # When no long word is left, REPLACE returns its input unchanged, so ?wN can
  # be a multi-word remainder or an empty string (and STRAFTER with an empty
  # argument starts again from the whole title). The final FILTER and
  # COUNT(DISTINCT ?x) absorb those cases.
  BIND(REPLACE(?ltitle, "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w1)
  BIND(REPLACE(STRAFTER(?ltitle, ?w1), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w2)
  BIND(REPLACE(STRAFTER(?ltitle, ?w2), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w3)
  BIND(REPLACE(STRAFTER(?ltitle, ?w3), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w4)
  BIND(REPLACE(STRAFTER(?ltitle, ?w4), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w5)
  BIND(REPLACE(STRAFTER(?ltitle, ?w5), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w6)
  BIND(REPLACE(STRAFTER(?ltitle, ?w6), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w7)
  BIND(REPLACE(STRAFTER(?ltitle, ?w7), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w8)
  BIND(REPLACE(STRAFTER(?ltitle, ?w8), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w9)
  BIND(REPLACE(STRAFTER(?ltitle, ?w9), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w10)
  # Unpivot ?w1..?w10 into one row per (work, word position).
  VALUES ?w_ { 1 2 3 4 5 6 7 8 9 10}
  BIND(IF(?w_ = 1, ?w1, IF(?w_ = 2, ?w2, IF(?w_ = 3, ?w3, IF(?w_ = 4, ?w4, IF(?w_ = 5, ?w5, IF(?w_ = 6, ?w6, IF(?w_ = 7, ?w7,
       IF(?w_ = 8, ?w8, IF(?w_ = 9, ?w9, ?w10))))))))) AS ?w)
  FILTER(REGEX(?w, "^\\w+$")) # since ?w may evaluate to an empty string, e.g. for one-word titles
}
# Group by the word only: ?item is produced by the SAMPLE aggregate in SELECT
# and is never bound inside WHERE, so it cannot act as a grouping key.
GROUP BY ?w
ORDER BY DESC(?l) DESC(?c)
LIMIT 2000 |
This comment has been minimized.
This comment has been minimized.
# Long words (6+ word characters) found in the titles (wdt:P1476) of works by
# one author (wdt:P50 wd:Q46155812) that have no main-subject (wdt:P921)
# statement. For every word the result gives one sample work (?item), the
# number of distinct works whose title contains it (?c) and its length (?l).
# Most frequent words come first; ties are broken by word length.
SELECT (SAMPLE(DISTINCT ?x) AS ?item) ?w (COUNT(DISTINCT ?x) AS ?c) (STRLEN(?w) AS ?l) WHERE {
  {
    # Candidate works with their titles, capped at 10000 rows.
    SELECT DISTINCT ?x ?title WHERE {
      # ?date_modified is not used any further; the triple is kept so that the
      # set of matched works stays the same.
      ?x schema:dateModified ?date_modified ;
         # wdt:P921 wd:Q202864 ;
         wdt:P50 wd:Q46155812 ;
         wdt:P1476 ?title.
      # NOTE(review): the threshold is 5 here although the word pattern below
      # needs 6+ characters; a title that is a single five-character word
      # therefore passes the final FILTER unchanged. Confirm this is intended.
      FILTER(STRLEN(?title) >= 5)
    }
    LIMIT 10000
  }
  # Keep only works that have no main-subject statement at all.
  FILTER NOT EXISTS { ?x wdt:P921 ?topic .}
  BIND(LCASE(?title) AS ?ltitle)
  # ?w1 is the first long word of the title; each following ?wN is the first
  # long word in the part of the title after the previous word.
  # When no long word is left, REPLACE returns its input unchanged, so ?wN can
  # be a multi-word remainder or an empty string (and STRAFTER with an empty
  # argument starts again from the whole title). The final FILTER and
  # COUNT(DISTINCT ?x) absorb those cases.
  BIND(REPLACE(?ltitle, "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w1)
  BIND(REPLACE(STRAFTER(?ltitle, ?w1), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w2)
  BIND(REPLACE(STRAFTER(?ltitle, ?w2), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w3)
  BIND(REPLACE(STRAFTER(?ltitle, ?w3), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w4)
  BIND(REPLACE(STRAFTER(?ltitle, ?w4), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w5)
  BIND(REPLACE(STRAFTER(?ltitle, ?w5), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w6)
  BIND(REPLACE(STRAFTER(?ltitle, ?w6), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w7)
  BIND(REPLACE(STRAFTER(?ltitle, ?w7), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w8)
  BIND(REPLACE(STRAFTER(?ltitle, ?w8), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w9)
  BIND(REPLACE(STRAFTER(?ltitle, ?w9), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w10)
  # Unpivot ?w1..?w10 into one row per (work, word position).
  VALUES ?w_ { 1 2 3 4 5 6 7 8 9 10}
  BIND(IF(?w_ = 1, ?w1, IF(?w_ = 2, ?w2, IF(?w_ = 3, ?w3, IF(?w_ = 4, ?w4, IF(?w_ = 5, ?w5, IF(?w_ = 6, ?w6, IF(?w_ = 7, ?w7,
       IF(?w_ = 8, ?w8, IF(?w_ = 9, ?w9, ?w10))))))))) AS ?w)
  FILTER(REGEX(?w, "^\\w+$")) # since ?w may evaluate to an empty string, e.g. for one-word titles
}
# Group by the word only: ?item is produced by the SAMPLE aggregate in SELECT
# and is never bound inside WHERE, so it cannot act as a grouping key.
GROUP BY ?w
ORDER BY DESC(?c) DESC(?l)
LIMIT 2000
Compared to the previous one, the differences are
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Daniel-Mietchen commented Jan 12, 2019
The hope is that this could help with topic tagging.
Started to work on this last night and posted at
https://www.wikidata.org/w/index.php?title=Wikidata:Request_a_query&oldid=832978769#Find_longest_substrings_in_a_title but still don't have a real solution.
The partial solution of sorting works by the length of their first or second word, however, is already there, and it is useful for identifying topics that need attention.