Core Drama via Docuscope¶

In these experiments, I will repeat what we did in the Core Drama notebook, but this time using the Docuscope tag counts as features rather than the 200 most common words.

Setup¶

# this setup is meant to be common across all the demos...

# since the notebooks may not go where the module code is...
# change directory to the root - since that's where everything goes
import sys
import os

# The notebooks sit in a "Notebooks" folder, while the package code is imported
# relative to the folder above it -- so step up one level when started there.
if os.path.split(os.getcwd())[1] == "Notebooks":
    print("Changing directory from:", os.getcwd())
    os.chdir("..")
    print("                     to:", os.getcwd())
    
### Magic to make notebooks work nicely
%matplotlib inline

### standard useful python stuff
from collections import Counter, defaultdict
from importlib import reload

### plotting stuff
import pylab
import seaborn

### numerical stuff
import numpy
from scipy import stats

### my CorpusModeler package stuff
import Corpus.DataMatrix as DM
import Corpus.reduce 
import Corpus.cull 
import Corpus.linear
import Corpus.nonlinear
import Tests.plays as Plays

Changing directory from: C:\Users\gleic\Projects\corpusmodeler\Notebooks
                     to: C:\Users\gleic\Projects\corpusmodeler

# Load the plays corpus with Docuscope features (see Plays.readDScore).
p = Plays.readDScore()
# Make the "date of writing" metadata available under the shorter name "date".
# NOTE(review): the getMetaDataArray("date") call below presumably reads this
# column, so this assignment has to come first -- confirm.
p.columns["date"] = p.getMetaDataArray("date of writing")
Plays.addNames(p)
# pn is the normalized matrix that every analysis cell below works on.
pn = p.normalize()
# Per-play metadata used later as class labels / grouping keys.
genres = p.getMetaDataArray("genre")
dates = p.getMetaDataArray("date")
authors = p.getMetaDataArray("author")

Culled from 1292 to 554 in 0.0014810562133789062

A first map - the most common LATs¶

# Rank the features by count, largest first, and show the names of the top five.
counts = pn.wordCounts()
# numpy.argsort sorts ascending, so reverse it to get descending order.
countOrd = numpy.argsort(counts)[::-1]
[pn.terms[i] for i in countOrd[:5]]

['DirectAddress',
 'SenseObject',
 'ReportingEvents',
 'PersonProperty',
 'FirstPer']

# Plot the plays using the two most common LATs as xc / yc, and hand the next
# three most common to oc (presumably extra selectable columns -- confirm
# against Plays.uplot).
Plays.uplot(
    pn,
    xc=countOrd[0],
    yc=countOrd[1],
    oc=[countOrd[rank] for rank in range(2, 5)],
    fname="core_docuscope_mostCommon",
    toggles=["DirectAddress","SenseObject"],
)

# Tabulate the AUC of each of the five most common LATs against each genre.
pn.aucTable(genres,countOrd[:5])

               :      CO       TR       TC       HI 
  DirectAddress:   0.664    0.302    0.495    0.493 
    SenseObject:   0.451    0.574    0.386    0.706 
ReportingEvents:   0.367    0.681    0.423    0.621 
 PersonProperty:   0.701    0.283    0.370    0.681 
       FirstPer:   0.663    0.339    0.462    0.452

The most common LATs (unlike the most common words) aren't great for distinguishing comedies. Let's see which LATs are best (i.e., have the AUC farthest from 0.5) for picking out comedies.

# Binary target: 1 for comedies ("CO"), 0 for every other genre.
com = [int(g == "CO") for g in genres]
auc = pn.auc(com)
# A feature is useful when its AUC is far from 0.5 in either direction, so
# rank by the distance from 0.5 (largest first) and show the ten best.
aucMag = [abs(score - 0.5) for score in auc]
aucOrd = numpy.argsort(aucMag)[::-1]
pn.aucTable(genres, aucOrd[:10])

                 :      CO       TR       TC       HI 
            Anger:   0.185    0.814    0.595    0.533 
             Fear:   0.199    0.774    0.613    0.563 
       Negativity:   0.219    0.814    0.549    0.504 
CommonAuthorities:   0.236    0.681    0.536    0.874 
              Sad:   0.258    0.728    0.597    0.512 
     StandardsNeg:   0.259    0.727    0.588    0.533 
Negative_Relation:   0.276    0.704    0.575    0.566 
        Inclusive:   0.278    0.645    0.553    0.786 
         OralCues:   0.718    0.304    0.409    0.465 
          Numbers:   0.718    0.340    0.328    0.535

# Re-execute the Plays.PL module so that any edits made to it since it was
# first imported are picked up without restarting the kernel.
reload(Plays.PL)
# Plot Anger against OralCues, with four more of the strongest
# comedy-separating LATs passed as oc.
Plays.uplot(pn,xc="Anger",yc="OralCues",oc=["Fear","Negativity","Numbers","CommonAuthorities"],fname="core_docuscope_bestCom",toggles=["Anger","OralCues"])

# Build three hand-picked features, each from a group of LATs.
# NOTE(review): how the members of a group are combined is decided by
# Corpus.linear.fromWords (presumably an unweighted sum) -- confirm.
manualCom = Corpus.linear.fromWords(pn,[["Anger","Fear"],["Anger","Fear","Negativity"],["OralCues","Numbers","PersonProperty"]])
# Short display names for the three combined features, in the same order.
manualCom.terms = ["Ang+Fear","Ang+Fear+Neg","Top3Com"]
manualCom.aucTable(genres,[0,1,2])

            :      CO       TR       TC       HI 
    Ang+Fear:   0.150    0.830    0.623    0.555 
Ang+Fear+Neg:   0.179    0.836    0.579    0.522 
     Top3Com:   0.753    0.249    0.350    0.632

PCA¶

# Fit a PCA model on the normalized matrix.
# NOTE(review): the second argument (10) is presumably the number of
# components to keep -- confirm against Corpus.linear.PCA.
pca = Corpus.linear.PCA(pn,10)
Plays.uplot(pca,fname="core_docuscope_pca")
# AUC per genre for the first three components.
pca.aucTable(genres,[0,1,2])

Build PCA model in  0.01

      :      CO       TR       TC       HI 
PCA(0):   0.268    0.751    0.484    0.648 
PCA(1):   0.364    0.596    0.692    0.308 
PCA(2):   0.470    0.438    0.515    0.774

So, PCA is pretty good at telling tragedy from comedy - but not nearly as good as Fear or Anger by themselves. Put Anger and Fear together, and the separation is much better.

# Fit an RCA model using the genre of each play as its class label.
rca = Corpus.linear.RCA(pn,genres)
# NOTE(review): rca.topWords(0) displays as a list of (term, weight) tuples
# later in this notebook, whereas the other uplot calls pass plain term names
# as toggles -- confirm that uplot accepts both forms.
Plays.uplot(rca,fname="core_docuscope_rca",toggles=rca.topWords(0))
# AUC per genre for the first three components.
rca.aucTable(genres,[0,1,2])

Build RCA model in  0.04

      :      CO       TR       TC       HI 
RCA(0):   0.065    0.910    0.561    0.767 
RCA(1):   0.429    0.638    0.650    0.022 
RCA(2):   0.620    0.647    0.134    0.386

# Run t-SNE on top of the RCA model and plot the resulting embedding.
rca_tsne = Corpus.nonlinear.tsne(rca)
Plays.uplot(rca_tsne,fname="core_docuscope_rca_tsne")

Build TSNE model in  4.09

# Show the (term, weight) pairs topWords reports for RCA component 0.
rca.topWords(0)

[('MoveBody', 590.90531110965253),
 ('Neg_Citation', 574.46364505122904),
 ('Future_in_Past', 430.33886852614728),
 ('Example', 393.04904513992318),
 ('FollowUp', 357.90872502972809),
 ('Apology', 347.75513494865982),
 ('Anger', 229.98953552497338),
 ('Fear', 224.51047451243585),
 ('Feedback', -228.70357936196586),
 ('OpenQuery', -235.39848168913755),
 ('Positive_Attribution', -292.08792175064468),
 ('ReceivedPOV', -322.03962180020051),
 ('Contested_Citation', -337.61282256499101),
 ('Definition', -348.53321450650299),
 ('PriorKnowledge', -362.75866852191314),
 ('ConfirmExperience', -363.67734104409436),
 ('Precedent_Defending', -371.22332169865888),
 ('DirectReasoning', -443.63692589830708),
 ('MatureProcess', -468.55745468204924),
 ('CommunicatorRole', -555.80113850299381)]

# Show the (term, weight) pairs topWords reports for RCA component 1.
rca.topWords(1)

[('CommunicatorRole', 705.37332357250898),
 ('Precedent_Setting', 491.6801439215954),
 ('PriorKnowledge', 437.480445827354),
 ('Future_in_Past', 398.65506721957416),
 ('Example', 368.94657850493905),
 ('SelfReluctance', 328.36091234038884),
 ('Precedent_Defending', 300.32150248096161),
 ('Innovations', 280.02240765761746),
 ('Promise', 256.87749636471239),
 ('MatureProcess', 220.3675324222678),
 ('Positive_Attribution', 209.04805383881515),
 ('Repair_Citation', -232.58742547315447),
 ('SceneShift', -234.27576643001208),
 ('Responsibility', -243.64953014510266),
 ('ReceivedPOV', -288.63341082477774),
 ('Feedback', -309.69967790860005),
 ('DirectReasoning', -331.62489334995945),
 ('Reassure', -345.6218434718067),
 ('TimeDate', -503.60267207263564),
 ('Contested_Citation', -757.86460646352145)]

# Derive a reduced model from the RCA via topNCol(10), then plot and score it.
# NOTE(review): presumably this keeps only the 10 strongest terms per
# component -- confirm against topNCol.
rcan = rca.topNCol(10)
Plays.uplot(rcan,fname="core_docuscope_rcaN10")
rcan.aucTable(genres,[0,1,2])

      :      CO       TR       TC       HI 
RCA(0):   0.374    0.662    0.425    0.648 
RCA(1):   0.471    0.589    0.516    0.306 
RCA(2):   0.487    0.614    0.309    0.612

# Same reduction as the previous cell but with N=20; rcan is overwritten.
# NOTE(review): presumably this keeps only the 20 strongest terms per
# component -- confirm against topNCol.
rcan = rca.topNCol(20)
Plays.uplot(rcan,fname="core_docuscope_rcaN20")
rcan.aucTable(genres,[0,1,2])

      :      CO       TR       TC       HI 
RCA(0):   0.153    0.851    0.566    0.603 
RCA(1):   0.471    0.600    0.549    0.197 
RCA(2):   0.480    0.623    0.300    0.632

Before and After¶

def _periodCode(year):
    """Return 1 for dates before 1590, 2 for dates after 1613, otherwise 0."""
    if year < 1590:
        return 1
    if year > 1613:
        return 2
    return 0


# Label every play as "before" (1), "after" (2) or in-between (0), and report
# how many plays fall into each bucket.
dix = [_periodCode(d) for d in dates]
print(Counter(dix))
# Train RCA on the before/after labels; skipZeros=True is passed so that the
# in-between plays (label 0) are presumably ignored when fitting -- confirm.
dixRCA = Corpus.linear.RCA(pn, dix, skipZeros=True)
# Same projection drawn twice: once grouped by before/after, once with the
# default grouping.
Plays.uplot(dixRCA, grpCol=dix, nameGroupCol="BeforeAfter", fname="core_docuscope_before_after_shak")
Plays.uplot(dixRCA, fname="core_docuscope_before_after_shak_genre")

Counter({2: 282, 0: 216, 1: 56})
Build RCA model in  0.03

# Show the (term, weight) pairs topWords reports for component 0 of the
# before/after RCA.
dixRCA.topWords(0)

[('Authoritative_Citation', 430.26477171981134),
 ('Future_in_Past', 429.63116980704763),
 ('Support', 296.52340157702849),
 ('Contested_Citation', 237.22815407866423),
 ('TimeDate', 229.11422568311181),
 ('Negative_Attribution', 220.80337126456291),
 ('Attack_Citation', 207.257229226069),
 ('Feedback', 176.19837609404661),
 ('MoveBody', -172.56607571897229),
 ('Confront', -207.69394886842787),
 ('ConfirmExperience', -216.3845976192969),
 ('Positive_Attribution', -224.04686126030606),
 ('ConfirmedThght', -241.8015657115223),
 ('MatureProcess', -282.49394997706145),
 ('Neg_Citation', -336.26687260181615),
 ('Precedent_Defending', -365.70889610338247),
 ('Promise', -388.47787410504168),
 ('Substitution', -696.24758706108116),
 ('Innovations', -726.98582225698067),
 ('Repair_Citation', -749.56385532974639)]

Authorship¶

## count the plays per author and highlight the six most prolific on the PCA map
authorCount = Counter(authors)
# Counter.most_common() returns (author, count) pairs.  onlyGroups is given
# bare author names elsewhere in this notebook (authorToSeparate), so strip
# the counts before passing the list on.
Plays.uplot(pca,grpCol="author",onlyGroups=[name for name, _count in authorCount.most_common(6)],fname="core_docuscope_author_pca")

# Authors to train on: the seven most common author labels, minus the
# "Anon." label (presumably a catch-all rather than one writer).
authorToSeparate = [name for name, _count in authorCount.most_common(7) if name != "Anon."]
# Class labels for RCA: 1..N for the chosen authors, 0 for every other play.
# The dictionary is already 1-based, so its values are used directly; the
# earlier version added a second +1, which left label 1 unused.
authorNumDict = {name: label for label, name in enumerate(authorToSeparate, start=1)}
authorNums = [authorNumDict.get(a, 0) for a in authors]
# skipZeros=True is passed so that label-0 plays are presumably left out of
# the fit -- confirm against Corpus.linear.RCA.
authorRCA = Corpus.linear.RCA(pn,authorNums,skipZeros=True)
# NOTE(review): these fnames end in ".csv" while the other uplot calls in this
# notebook pass no extension -- confirm that this is intended.
Plays.uplot(authorRCA,grpCol="author",onlyGroups=authorToSeparate,fname="core_docuscope_author_rca.csv")
# Same projection for the authors ranked 8-13.  most_common() yields
# (author, count) pairs, so keep only the names, matching the call above.
Plays.uplot(authorRCA,grpCol="author",onlyGroups=[name for name, _count in authorCount.most_common(13)[7:]],fname="core_docuscope_author_rca_nt.csv")

Build RCA model in  0.05

# Run t-SNE on top of the author RCA model and plot it, showing only the
# authors the RCA was trained to separate.
authorTSNE = Corpus.nonlinear.tsne(authorRCA)
Plays.uplot(authorTSNE,grpCol="author",onlyGroups=authorToSeparate,fname="core_docuscope_author_rca_tsne.csv")

Build TSNE model in  4.41