TF-IDF Example

Source: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

# import libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
# create text documents
D1 = 'Text mining is fun'
D2 = 'I love Text mining yea'
# Converting to Lowercase
D1 = D1.lower()
D2 = D2.lower()
print(D1)
print(D2)
# Tokenize
BOW_D1 = D1.split(' ')
BOW_D2 = D2.split(' ')
print(BOW_D1)
print(BOW_D2)
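
Note that split(' ') is the simplest possible tokenizer. The Word2Vec example below uses nltk's word_tokenize instead, which also separates punctuation into its own tokens; a quick illustrative check (the sentence here is made up):

import nltk
nltk.download('punkt') # word_tokenize needs the punkt tokenizer models
from nltk.tokenize import word_tokenize
print(word_tokenize('I love Text mining, yea!')) # ['I', 'love', 'Text', 'mining', ',', 'yea', '!']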

Create the vocabulary: the set of all unique words across both documents

dic = set(BOW_D1).union(set(BOW_D2))
dic

Count raw word frequencies in each document (the basis for Term Frequency)

word_count_D1 = dict.fromkeys(dic, 0) # initialize every vocabulary word's count to zero for doc D1
# count word frequency in D1
for word in BOW_D1: 
    word_count_D1[word] += 1

word_count_D2 = dict.fromkeys(dic, 0) # initialize every vocabulary word's count to zero for doc D2
# count word frequency in D2
for word in BOW_D2:
    word_count_D2[word] += 1
print(word_count_D1)
print(word_count_D2)

Create the TF function: term frequency divides each word's raw count by the total number of words in the document, TF(t, d) = count(t, d) / len(d)

def TF(word_count, BOW):
    k = len(BOW) # total number of words in the document
    tfDict = {}
    for word, count in word_count.items():
        tfDict[word] = count / float(k)
    return tfDict
tf_D1 = TF(word_count_D1, BOW_D1)
tf_D2 = TF(word_count_D2, BOW_D2)
print(tf_D1)
print(tf_D2)
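
As a sanity check, the same computation takes a few lines with collections.Counter from the standard library (tf_quick is just an illustrative name):

from collections import Counter

def tf_quick(tokens):
    counts = Counter(tokens) # raw count of each word in the document
    return {word: count / len(tokens) for word, count in counts.items()}

print(tf_quick(BOW_D1)) # every word of D1 appears once among 4 tokens, so each TF is 0.25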

Create the IDF function: inverse document frequency is IDF(t) = log(N / df(t)), where N is the number of documents and df(t) is the number of documents containing t

import math
def IDF(documents):
    N = len(documents) # total number of documents
    idfDict = dict.fromkeys(documents[0].keys(), 0) # document frequency of each vocabulary word
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict
idfs = IDF([word_count_D1, word_count_D2])
idfs
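
With only two documents, words that occur in both ('text' and 'mining') get IDF = log(2/2) = 0 and therefore drop out of the TF-IDF vectors below, while words unique to one document get IDF = log(2/1) ≈ 0.693.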

Create the TF × IDF function: each word's score is TF-IDF(t, d) = TF(t, d) × IDF(t)

def TFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

Compute the TF × IDF vectors for both documents

import pandas as pd
tfidf_D1 = TFIDF(tf_D1, idfs)
tfidf_D2 = TFIDF(tf_D2, idfs)
df = pd.DataFrame([tfidf_D1, tfidf_D2])
df
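
For comparison, scikit-learn's TfidfVectorizer implements the same idea, but its defaults differ (smoothed IDF, l2 normalization, and a token pattern that drops one-character tokens such as 'i'), so its numbers will not match the manual values above exactly. A minimal sketch:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer() # defaults: smooth_idf=True, norm='l2'
X = vectorizer.fit_transform([D1, D2]) # rows = documents, columns = vocabulary terms
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()) # get_feature_names_out needs scikit-learn >= 1.0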

Word2Vec Example

from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings
import pandas as pd
import gensim 
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
import requests
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giacomomarino/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

Download the Enrichr paper from PubMed Central (PMC)

url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4987924&retmode=json&email=my_email@example.com'
r = requests.get(url, allow_redirects=True)
open('data.xml', 'wb').write(r.content) # save the XML; the number printed is the byte count returned by write()
100442
# Read the downloaded 'data.xml' file into a string
sample = open("data.xml", "r")
s = sample.read()
import pubmed_parser

# parse the PMC XML into a dictionary of article fields and keep the abstract text
res = pubmed_parser.parse_pubmed_xml('data.xml')

text = res['abstract']
data = [] 
# iterate through each sentence in the file 
for i in sent_tokenize(text): 
  temp = [] 
  # tokenize the sentence into words 
  for j in word_tokenize(i): 
      temp.append(j.lower()) 
  data.append(temp)

print(data)
[['enrichment', 'analysis', 'is', 'a', 'popular', 'method', 'for', 'analyzing', 'gene', 'sets', 'generated', 'by', 'genome-wide', 'experiments', '.'], ['here', 'we', 'present', 'a', 'significant', 'update', 'to', 'one', 'of', 'the', 'tools', 'in', 'this', 'domain', 'called', 'enrichr', '.'], ['enrichr', 'currently', 'contains', 'a', 'large', 'collection', 'of', 'diverse', 'gene', 'set', 'libraries', 'available', 'for', 'analysis', 'and', 'download', '.'], ['in', 'total', ',', 'enrichr', 'currently', 'contains', '180', '184', 'annotated', 'gene', 'sets', 'from', '102', 'gene', 'set', 'libraries', '.'], ['new', 'features', 'have', 'been', 'added', 'to', 'enrichr', 'including', 'the', 'ability', 'to', 'submit', 'fuzzy', 'sets', ',', 'upload', 'bed', 'files', ',', 'improved', 'application', 'programming', 'interface', 'and', 'visualization', 'of', 'the', 'results', 'as', 'clustergrams', '.'], ['overall', ',', 'enrichr', 'is', 'a', 'comprehensive', 'resource', 'for', 'curated', 'gene', 'sets', 'and', 'a', 'search', 'engine', 'that', 'accumulates', 'biological', 'knowledge', 'for', 'further', 'biological', 'discoveries', '.'], ['enrichr', 'is', 'freely', 'available', 'at', ':', 'http', ':', '//amp.pharm.mssm.edu/enrichr', '.']]
# Create CBOW model (gensim's default, sg=0; a skip-gram variant is sketched below)
model1 = gensim.models.Word2Vec(data, min_count=1, max_vocab_size=300, window=5)
print(model1)
Word2Vec<vocab=84, vector_size=100, alpha=0.025>
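
The printout shows that the model learned a vocabulary of 84 tokens, each mapped to a 100-dimensional vector (gensim's default vector_size).
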
# Print results 
print("Cosine similarity between 'cell' and 'receptor' - Cbow: ") 
model1.wv.similarity('enrichr', 'visualization')
Cosine similarity between 'cell' and 'receptor' - Cbow: 
0.056125656
model1.wv.similar_by_word("enrichr",topn=4)
[('collection', 0.19100971519947052),
 ('significant', 0.18929652869701385),
 ('comprehensive', 0.1846179962158203),
 ('analysis', 0.16197505593299866)]
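
To answer the question in the comment above: gensim chooses the architecture with the sg parameter (sg=0, the default, is CBOW; sg=1 is skip-gram). A minimal sketch on the same tokenized abstract (model2 is just an illustrative name):

# Train a skip-gram model instead of CBOW by setting sg=1
model2 = gensim.models.Word2Vec(data, min_count=1, window=5, sg=1)
print(model2.wv.similarity('enrichr', 'visualization'))
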
data = [['bulk', 'rna', 'sequenc', 'of', 'microglia', 'in', 'xtg', 'ad', 'mice', 'treat', 'with', 'anti', 'nk', 'deplet', 'antibodi', 'or', 'isotyp', 'control', 'the', 'goal', 'of', 'thi', 'experi', 'is', 'to', 'determin', 'the', 'effect', 'of', 'nk', 'cell', 'on', 'microglia', 'inflamm', 'in', 'xtg', 'ad', 'mice', 'we', 'examin', 'the', 'transcriptom', 'chang', 'of', 'microglia', 'in', 'mice', 'treat', 'with', 'anti', 'nk', 'anitbodi', 'and', 'isotyp', 'control', 'by', 'deep', 'rna', 'seq', 'we', 'deplet', 'nk', 'cell', 'in', 'xtg', 'ad', 'mice', 'use', 'anti', 'nk', 'deplet', 'antibodi', 'we', 'then', 'sort', 'microglia', 'by', 'flouoresc', 'activ', 'cell', 'sort', 'rna', 'seq', 'wa', 'perform', 'with', 'the', 'bulk', 'microglia', 'popul']]
words=data[0]
import nltk
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import networkx as nx

word_fd = nltk.FreqDist(words) # unigram frequencies
bigram_fd = nltk.FreqDist(nltk.bigrams(words)) # frequencies of adjacent word pairs

# build an edge list: one row per bigram, with its count as the edge weight
res = [ [ x[0][0], x[0][1], x[1] ] for x in bigram_fd.most_common()]
res = pd.DataFrame(res, columns=['frm', 'to', 'weight'])
res = res.groupby(['frm', 'to']).agg({'weight': ['sum']})
res.reset_index(inplace=True)
res.columns = ['frm', 'to', 'weight']
G = nx.from_pandas_edgelist(res, 'frm', 'to', ['weight'])
G.size() # number of edges in the co-occurrence graph
70
try:
    # shortest path length between 'control' and 'isotyp' in the bigram graph
    n = nx.shortest_path_length(G, "control", "isotyp")
    print(n)
except nx.NetworkXNoPath:
    print('No path')
1
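
The same graph can also be queried for hub terms; for example, degree centrality (a standard networkx function) ranks words by how many distinct neighbors they co-occur with. A minimal sketch:

# top 5 words by degree centrality in the bigram co-occurrence graph
centrality = nx.degree_centrality(G)
print(sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5])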