![]() |
FREYA WP2 User Story 3 | As an administrator for the University of Oxford I am interested in the reuse of research outputs from our university, so that I can help identify the most interesting research outputs. |
---|---|---|
It is important for research organisations to measure the quality and quantity of their outputs, as well as their relevance to the latest global research trends and to their own strategic science direction.
This notebook uses the DataCite GraphQL API to retrieve up to 100 outputs (e.g. publications or datasets) from the University of Oxford in order to quantify and visualise their reuse.

Goal: By the end of this notebook, for a given organization, you should be able to display:
%%capture
# Install required Python packages
!pip install gql requests numpy plotly pyvis
# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
# HTTP transport pointed at the DataCite GraphQL endpoint (use_json=True).
_transport = RequestsHTTPTransport(
    use_json=True,
    url='https://api.datacite.org/graphql',
)

# Client used by the query cells below; fetch_schema_from_transport=True asks
# gql to obtain the schema through the transport.
client = Client(
    fetch_schema_from_transport=True,
    transport=_transport,
)
Define the GraphQL query to retrieve up to 100 outputs (e.g. publications or datasets) from University of Oxford, using its Research Organization Registry (ROR) identifier.
# Variables for the query below: the organization's ROR identifier, the
# maximum number of outputs to retrieve, and the minimum number of views an
# output must have to be included.
query_params = dict(
    rorId="https://ror.org/052gg0110",
    maxOutputs=100,
    minViews=100,
)

# GraphQL document: organization-level metrics plus, for each retrieved work,
# its type, titles, citations, creators and per-work metric counts.
query = gql("""query getOutputs($rorId: ID!, $maxOutputs: Int!, $minViews: Int!)
{
organization(id: $rorId) {
id
name
alternateName
citationCount
viewCount
downloadCount
works(hasViews: $minViews, first: $maxOutputs) {
totalCount
published {
title
count
}
resourceTypes {
title
count
}
nodes {
id
type
publisher
publicationYear
titles {
title
}
citations {
nodes {
id
titles {
title
}
}
}
creators {
id
name
affiliation {
id
name
}
}
citationCount
viewCount
downloadCount
}
}
}
}
""")
Run the above query via the GraphQL client
import json
# Pass the variables as a plain dict. Pre-encoding them with json.dumps hands
# gql a JSON *string* where it expects a mapping of variable names to values.
data = client.execute(query, variable_values=query_params)
Display total number of citations, views and downloads of University of Oxford's outputs.
# Sum each metric over the outputs that were actually retrieved for the
# organization, then show the totals as a Markdown table.
organization = data['organization']
organizationName = organization['name']
works = organization['works']
nodes = works['nodes']

# Start every metric at zero, then add the per-output counts.
metricCounts = {'citationCount': 0, 'viewCount': 0, 'downloadCount': 0}
for node in nodes:
    for metric in metricCounts:
        # "or 0" guards against a null count in the response
        # (NOTE(review): the schema may already guarantee non-null ints).
        metricCounts[metric] += node[metric] or 0

# One Markdown table row per metric.
tableBody = "".join(
    "%s | **%s**\n" % (metric, count) for metric, count in metricCounts.items()
)

# The sums cover only the retrieved nodes (capped by maxOutputs), which can be
# fewer than works.totalCount, so report both numbers instead of presenting
# totalCount as the number of outputs that were aggregated.
display(Markdown(
    "Aggregated metric counts across the %d retrieved outputs (of %d in total) of [%s](%s):"
    % (len(nodes), works['totalCount'], organizationName, query_params['rorId'])
))
display(Markdown("|Metric | Aggregated Count|\n|---|---|\n%s" % tableBody))
Aggregated metric counts across 265 outputs of University of Oxford:
Metric | Aggregated Count |
---|---|
citationCount | 100 |
viewCount | 27997 |
downloadCount | 10925 |
Plot a stacked bar chart showing how each type of University of Oxford's outputs contributes its metric counts to the corresponding aggregated total.
import plotly.io as pio
import plotly.express as px
from IPython.display import IFrame
import pandas as pd
# Adapted from: https://stackoverflow.com/questions/58766305/is-there-any-way-to-implement-stacked-or-grouped-bar-charts-in-plotly-express
def px_stacked_bar(df, color_name='Metric', y_name='Metrics', **pxargs):
    """Return a Plotly Express stacked bar chart built from a wide DataFrame.

    The (single-level) index of ``df`` supplies the x-axis categories and each
    column becomes one coloured segment of the stacked bars.

    Args:
        df: Wide DataFrame; index = x-axis categories, columns = series.
        color_name: Name of the melted column holding the original column
            names (used for the bar colours / legend title).
        y_name: Name of the melted column holding the values (y-axis).
        **pxargs: Extra keyword arguments forwarded to ``px.bar``. A
            ``color_discrete_sequence`` given here overrides the default
            Pastel1 palette.

    Returns:
        The figure returned by ``px.bar``.
    """
    wide = df.reset_index()
    # reset_index() inserts the former index as the first column. Reading the
    # name from there also works when the index is unnamed, where
    # df.index.name is None and melt/px.bar would otherwise get no id column.
    idx_col = wide.columns[0]
    long_df = pd.melt(wide, id_vars=idx_col, var_name=color_name, value_name=y_name)
    # For Plotly colour sequences see: https://plotly.com/python/discrete-color/
    # setdefault lets callers pass their own palette without triggering a
    # "multiple values for keyword argument" TypeError.
    pxargs.setdefault('color_discrete_sequence', px.colors.qualitative.Pastel1)
    return px.bar(long_df, x=idx_col, y=y_name, color=color_name, **pxargs)
# Build per-output-type metric counts and show them as a stacked bar plot.
organization = data['organization']
nodes = organization['works']['nodes']

# Collect the distinct output types. They are sorted so that the bar order is
# the same on every run (iterating a set of strings directly is not stable
# between interpreter sessions); key=str keeps the sort working should a type
# be missing (None).
outputTypesSet = {node['type'] for node in nodes}
outputTypes = sorted(outputTypesSet, key=str)

# Position 0 is the bar for all output types together; positions 1..n are the
# individual output types.
labels = {0: 'All Output Types'}
outputType2Pos = {}
for pos, outputType in enumerate(outputTypes, start=1):
    labels[pos] = outputType
    outputType2Pos[outputType] = pos

# Initialise metric counts for every bar position
citationCounts = dict.fromkeys(labels, 0)
viewCounts = dict.fromkeys(labels, 0)
downloadCounts = dict.fromkeys(labels, 0)

# Add each output's counts both to its own type (key: pos) and to the
# aggregated bar (key: 0)
for node in nodes:
    pos = outputType2Pos[node['type']]
    for target in (0, pos):
        citationCounts[target] += node['citationCount']
        viewCounts[target] += node['viewCount']
        downloadCounts[target] += node['downloadCount']

# Create stacked bar plot. The organization name is read from the response
# here so this cell does not depend on a variable set in another cell.
x_name = "%s's Output Types" % organization['name']
df = pd.DataFrame({
    x_name: labels,
    'Citations': citationCounts,
    'Views': viewCounts,
    'Downloads': downloadCounts,
})
fig = px_stacked_bar(df.set_index(x_name), y_name="Counts")

# Set plot background to transparent
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)'
})

# Alternative: write the interactive plot to an html file and embed that file
# pio.write_html(fig, file='ot_out.html')
# IFrame(src="./ot_out.html", width=500, height=500)
display(Markdown(
    "Citations, views and downloads for [%s](%s)'s outputs, shown per output type as stacked bar plot:"
    % (organization['name'], query_params['rorId'])
))
fig.show()
Citations, views and downloads for University of Oxford's outputs, shown per output type as stacked bar plot: