In [1]:
import csv
import os
import sys

import pandas as pd

DATA_DIR = '/Users/adam/downloads/'

csv.field_size_limit(sys.maxsize)

projects = {}

for filename in os.listdir(DATA_DIR):
    if not filename.endswith('-classifications.csv'):
        continue
    
    classifications = pd.read_csv(os.path.join(DATA_DIR, filename))
    users = projects.setdefault(
        filename.replace('-classifications.csv', ''), 
        set(map(int, classifications[classifications.user_id.notnull()].user_id.unique()))
    )
    print('{}: {} users'.format(filename, len(users)))
manatee-chat-classifications.csv: 1192 users
variable-star-zoo-classifications.csv: 581 users
grouse-grooves-classifications.csv: 975 users
cheetahs-of-central-namibia-classifications.csv: 3507 users
canid-camera-classifications.csv: 3605 users
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
treeversity-classifications.csv: 1662 users
project-plumage-classifications.csv: 1909 users
sounds-of-new-york-city-sonyc-classifications.csv: 246 users
orangutan-nest-watch-classifications.csv: 1847 users
wild-wolf-watch-classifications.csv: 582 users
seabirdwatch-classifications.csv: 8859 users
space-warps-hsc-classifications.csv: 4267 users
the-american-soldier-classifications.csv: 1560 users
floating-forests-classifications.csv: 801 users
london-bird-records-classifications.csv: 295 users
1961-census-classifications.csv: 242 users
beluga-bits-classifications.csv: 2069 users
reading-natures-library-classifications.csv: 1828 users
snapshot-grumeti-classifications.csv: 2676 users
rainforest-flowers-classifications.csv: 1523 users
calgary-captured-classifications.csv: 2125 users
the-wilds-wildlife-watch-classifications.csv: 771 users
penguin-watch-classifications.csv: 2150 users
snapshot-serengeti-classifications.csv: 3467 users
interior-least-tern-and-piping-plover-predators-classifications.csv: 739 users
protect-our-planet-from-solar-storms-classifications.csv: 1072 users
scotus-notes-behind-the-scenes-at-supreme-court-conference-classifications.csv: 1793 users
invader-id-classifications.csv: 1088 users
snapshot-ruaha-classifications.csv: 1533 users
parochial-archive-project-in-rome-classifications.csv: 422 users
science-scribbler-classifications.csv: 1309 users
the-community-seagrass-initiative-seagrass-explorer-classifications.csv: 475 users
parasite-safari-classifications.csv: 2083 users
galaxy-zoo-classifications.csv: 9962 users
In [2]:
sources = []
targets = []
counts = []

for project, users in projects.items():
    for other_project, other_users in projects.items():
        if project is other_project:
            continue
        if other_project in sources:
            continue
        common_users = len(users & other_users)
        sources.append(project)
        targets.append(other_project)
        counts.append(common_users)
In [3]:
data = pd.DataFrame({'sources': sources, 'targets': targets, 'counts': counts})
In [5]:
import holoviews as hv

hv.extension('bokeh')
%opts Chord [color_index='index' label_index='index' edge_color_index='sources' width=800 height=800]
%opts Chord (cmap='Category20' edge_cmap='Category20')
graph = hv.Chord(data)
graph
Out[5]: