In these experiments, I repeat what we did in the Core Drama notebook, but this time using the Docuscope tag counts as features rather than the 200 most common words.
# this setup is meant to be common across all the demos...
# since the notebooks may not go where the module code is...
# change directory to the root - since that's where everything goes
import sys
import os
# When the kernel starts inside a folder named "Notebooks", step up one level
# so the rest of the notebook runs from the parent directory. The before/after
# working directories are printed so the move is visible in the cell output.
if os.path.basename(os.getcwd()) == "Notebooks":
    print("Changing directory from:",os.getcwd())
    os.chdir("..")
    print(" to:",os.getcwd())
### Magic to make notebooks work nicely
%matplotlib inline
### standard useful python stuff
from collections import Counter, defaultdict
from importlib import reload
### plotting stuff
import pylab
import seaborn
### numerical stuff
import numpy
from scipy import stats
### my CorpusModeler package stuff
import Corpus.DataMatrix as DM
import Corpus.reduce
import Corpus.cull
import Corpus.linear
import Corpus.nonlinear
import Tests.plays as Plays
# Read the play data via Plays.readDScore() (presumably the Docuscope-scored
# corpus - confirm in Tests.plays) and pull out the metadata used further down.
p = Plays.readDScore()
# Make the "date of writing" metadata available under the shorter key "date";
# the dates lookup below reads it back through that key.
p.columns["date"] = p.getMetaDataArray("date of writing")
Plays.addNames(p)
# pn is the result of p.normalize(); the analyses below all operate on pn.
pn = p.normalize()
# Metadata arrays used later for grouping, labelling and AUC tables.
genres = p.getMetaDataArray("genre")
dates = p.getMetaDataArray("date")
authors = p.getMetaDataArray("author")
# Order the term indices of pn by descending count (argsort is ascending, so
# the result is flipped), then look at the five with the highest counts.
counts = pn.wordCounts()
countOrd = numpy.flipud(numpy.argsort(counts))
topFive = countOrd[:5]
# Bare expression: in the notebook this displays the names of those five terms.
[pn.terms[idx] for idx in topFive]
# Plot with the two highest-count terms as x/y and the next three passed as oc.
Plays.uplot(
    pn,
    xc=countOrd[0],
    yc=countOrd[1],
    oc=[countOrd[k] for k in range(2, 5)],
    fname="core_docuscope_mostCommon",
    toggles=["DirectAddress", "SenseObject"],
)
pn.aucTable(genres, topFive)
The most common LATs (Docuscope's Language Action Types), unlike the most common words, aren't great for distinguishing comedies. Let's see which LATs are best for picking out comedies, i.e. which have an AUC farthest from 0.5.
# Binary label per play: 1 when the genre code is "CO", otherwise 0.
com = [int(genre == "CO") for genre in genres]
auc = pn.auc(com)
# Distance of each AUC from 0.5; a larger distance (in either direction)
# means the term separates the "CO" plays from the rest more strongly.
aucMag = [abs(score - 0.5) for score in auc]
# Term indices ordered from largest to smallest distance.
aucOrd = numpy.flipud(numpy.argsort(aucMag))
pn.aucTable(genres, aucOrd[:10])
# Re-import Plays.PL so that edits made to that module since the kernel
# started are picked up before plotting.
reload(Plays.PL)
Plays.uplot(
    pn,
    xc="Anger",
    yc="OralCues",
    oc=["Fear", "Negativity", "Numbers", "CommonAuthorities"],
    fname="core_docuscope_bestCom",
    toggles=["Anger", "OralCues"],
)
# Hand-picked term groups, keyed by the label each resulting column receives.
# Dict insertion order keeps the labels aligned with the groups.
comGroups = {
    "Ang+Fear": ["Anger", "Fear"],
    "Ang+Fear+Neg": ["Anger", "Fear", "Negativity"],
    "Top3Com": ["OralCues", "Numbers", "PersonProperty"],
}
manualCom = Corpus.linear.fromWords(pn, list(comGroups.values()))
manualCom.terms = list(comGroups)
manualCom.aucTable(genres, [0, 1, 2])
# Run Corpus.linear.PCA on pn (second argument 10 - presumably the number of
# components to keep; confirm against Corpus.linear), plot the result, and
# show the genre AUC table for indices 0, 1 and 2 of the PCA output.
pca = Corpus.linear.PCA(pn,10)
Plays.uplot(pca,fname="core_docuscope_pca")
pca.aucTable(genres,[0,1,2])
So PCA is pretty good at telling tragedy from comedy, but not nearly as good as the Fear or Anger LATs by themselves. Put Anger and Fear together, and the separation is much better.
# Run Corpus.linear.RCA on pn with the genre labels, plot it, and show the
# genre AUC table for indices 0, 1 and 2 of the RCA output.
rca = Corpus.linear.RCA(pn,genres)
# The plot's toggles are whatever rca.topWords(0) returns.
Plays.uplot(rca,fname="core_docuscope_rca",toggles=rca.topWords(0))
rca.aucTable(genres,[0,1,2])
# Feed the RCA result through Corpus.nonlinear.tsne and plot that as well.
rca_tsne = Corpus.nonlinear.tsne(rca)
Plays.uplot(rca_tsne,fname="core_docuscope_rca_tsne")
# Bare expressions: in the notebook these display the topWords results for
# indices 0 and 1.
rca.topWords(0)
rca.topWords(1)
# Repeat the plot and AUC table on rca.topNCol(N) for N = 10 and then N = 20
# (presumably a version of the RCA model restricted to its N strongest terms
# - confirm against Corpus.linear). rcan is rebound for the second run, so
# only the N = 20 result remains afterwards.
rcan = rca.topNCol(10)
Plays.uplot(rcan,fname="core_docuscope_rcaN10")
rcan.aucTable(genres,[0,1,2])
rcan = rca.topNCol(20)
Plays.uplot(rcan,fname="core_docuscope_rcaN20")
rcan.aucTable(genres,[0,1,2])
def _periodCode(year):
    """Return 1 for years before 1590, 2 for years after 1613, otherwise 0."""
    if year < 1590:
        return 1
    if year > 1613:
        return 2
    return 0


# Label every play by period. Judging by the output file names, 1590-1613 is
# meant as Shakespeare's working period (assumption - confirm); plays inside
# that window get label 0.
dix = [_periodCode(d) for d in dates]
# Show how many plays fall under each label.
print(Counter(dix))
# RCA on the period labels, called with skipZeros=True.
dixRCA = Corpus.linear.RCA(pn, dix, skipZeros=True)
# Same projection plotted twice: once grouped by the period labels, once with
# uplot's default grouping.
Plays.uplot(
    dixRCA,
    grpCol=dix,
    nameGroupCol="BeforeAfter",
    fname="core_docuscope_before_after_shak",
)
Plays.uplot(dixRCA, fname="core_docuscope_before_after_shak_genre")
dixRCA.topWords(0)
## get the top N authors
# Number of plays per author.
authorCount = Counter(authors)
# NOTE(review): Counter.most_common returns (author, count) pairs, while the
# RCA plots below pass onlyGroups a list of bare author names - confirm that
# Plays.uplot accepts both forms.
Plays.uplot(
    pca,
    grpCol="author",
    onlyGroups=authorCount.most_common(6),
    fname="core_docuscope_author_pca",
)
# Names of the seven most frequent authors, leaving out the "Anon." bucket.
authorToSeparate = [
    name for name, _ in authorCount.most_common(7) if name != "Anon."
]
authorNumDict = {
    name: rank for rank, name in enumerate(authorToSeparate, start=1)
}
# Selected authors get their dict value plus one, everyone else gets 0.
# NOTE(review): the dict values already start at 1, so the labels start at 2 -
# confirm the extra +1 is intended.
authorNums = [authorNumDict.get(name, -1) + 1 for name in authors]
authorRCA = Corpus.linear.RCA(pn, authorNums, skipZeros=True)
# NOTE(review): these fname values end in ".csv", unlike the other uplot calls
# in this notebook - confirm what uplot does with the extension.
Plays.uplot(
    authorRCA,
    grpCol="author",
    onlyGroups=authorToSeparate,
    fname="core_docuscope_author_rca.csv",
)
# Same projection, showing the authors ranked 8th to 13th by play count, which
# were not among the labelled authors.
Plays.uplot(
    authorRCA,
    grpCol="author",
    onlyGroups=authorCount.most_common(13)[7:],
    fname="core_docuscope_author_rca_nt.csv",
)
authorTSNE = Corpus.nonlinear.tsne(authorRCA)
Plays.uplot(
    authorTSNE,
    grpCol="author",
    onlyGroups=authorToSeparate,
    fname="core_docuscope_author_rca_tsne.csv",
)