Import packages
import glob
import http.client
import json

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from itertools import zip_longest
from IPython.display import display, HTML

# widen the notebook cells
display(HTML("<style>.container { width:90% !important; }</style>"))
The folder with the exported Telegram chat (use the Telegram Desktop app for the export; it typically writes the chat as messages.html, messages2.html, and so on):
filepath = "C:\\temp\\telegram_chat\\*.html"
# DataFrame to store messages
DF = pd.DataFrame(columns=['Author', 'text'])

# iterate over the exported files and add all messages to the DataFrame
len_all = 0
for file in glob.glob(filepath):
    with open(file, "r", encoding="utf8") as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # both author headers (from_name) and message bodies (text) are collected,
    # so each message row inherits the most recently seen author
    rows = soup.find_all(attrs={"class": ['from_name', 'text']})
    author = "nobody"
    for index, row in enumerate(rows):
        if 'from_name' in str(row):
            author = row.contents[0].replace('\n', '')
        DF.loc[index + len_all, 'Author'] = author
        DF.loc[index + len_all, 'text'] = row.get_text().replace('\n', ' ').replace('\xa0', ' ')
    len_all += len(rows)
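For reference, a message in the exported HTML typically looks like the simplified fragment below (the exact markup may vary between Telegram Desktop versions); filtering on the from_name and text classes therefore captures both the author headers and the message bodies:

<div class="message default clearfix">
  <div class="body">
    <div class="from_name">Alice</div>
    <div class="text">Hello there!</div>
  </div>
</div>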
# Data cleaning: strip leading/trailing spaces, then drop rows whose text is just
# an author name (these come from the from_name elements themselves and from
# forwarded-message headers)
DF["Author"] = DF["Author"].str.strip()
DF["text"] = DF["text"].str.strip()
DF = DF[DF["Author"] != DF["text"]]
# keep only authors with more than 100 messages
DF = DF.groupby("Author").filter(lambda x: len(x) > 100)

# authors list
authors = DF["Author"].drop_duplicates().to_numpy()
DF.groupby("Author").count().sort_values(by="text", ascending=False)
I changed the names of all chat participants except me in the output for privacy reasons.
Azure Cognitive Services Text Analytics configuration.
Replace accessKey with your own key (and url, if your resource is in a different region).
More details on that here.
accessKey = 'youraccesskeyhere'
url = 'westeurope.api.cognitive.microsoft.com'
path = '/text/analytics/v2.0/Sentiment'
def TextAnalytics(documents):
    headers = {'Ocp-Apim-Subscription-Key': accessKey}
    conn = http.client.HTTPSConnection(url)
    body = json.dumps(documents)
    conn.request("POST", path, body, headers)
    response = conn.getresponse()
    return response.read()
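A quick smoke test of the function; the text and the score in the comment are illustrative, not real output:

sample = {'documents': [{'language': 'ru', 'id': '1', 'text': 'Отличный день!'}]}
print(json.loads(TextAnalytics(sample)))
# the v2.0 Sentiment endpoint returns roughly:
# {'documents': [{'id': '1', 'score': 0.9}], 'errors': []}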
A function to turn an author's messages into documents; we will send these documents for sentiment analysis.
Change the language if needed.
def getdocs(author):
    filtered = DF[DF["Author"] == author]
    documents = {'documents': []}
    # set the language of your chat; in my case it's Russian
    for count, text in enumerate(filtered["text"], start=1):
        documents['documents'].append({"language": "ru", "id": str(count), "text": text})
    return documents
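A quick sanity check of the payload shape (the printed values below are illustrative):

docs = getdocs(authors[0])
print(len(docs['documents']))
print(docs['documents'][0])  # e.g. {'language': 'ru', 'id': '1', 'text': '...'}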
There is a limit on the maximum number of documents per request: 1000.
We will split our messages into batches of 1000 documents with the function below.
This is only necessary if at least one author has more than 1000 messages.
def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
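A minimal illustration with a batch size of 3; note that zip_longest pads the last batch with the fill value, which is why the sending loop below filters out the None entries:

list(grouper([1, 2, 3, 4, 5], 3))
# -> [(1, 2, 3), (4, 5, None)]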
Sending the documents for sentiment analysis: the outer loop iterates over authors, the inner loop over batches of documents.
json_result = []
for authindex, author in enumerate(authors):
    print(author)
    author_results = []
    docs = getdocs(author)
    for docindex, batch in enumerate(grouper(docs['documents'], 1000)):
        print("batch number:", docindex)
        # drop the None padding that grouper adds to the last batch
        payload = {'documents': [doc for doc in batch if doc is not None]}
        result = TextAnalytics(payload)
        author_results += json.loads(result)['documents']
    json_result.append(author_results)
Results DataFrame: compute the average sentiment score for each author.
DF_results = pd.DataFrame(columns=['Author', 'Sentiment'])
for authindex, author in enumerate(authors):
    scores = [doc['score'] for doc in json_result[authindex]]
    DF_results.loc[authindex, 'Author'] = author
    DF_results.loc[authindex, 'Sentiment'] = sum(scores) / len(scores)
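The same table can be built in a single expression; a sketch assuming the json_result structure above:

DF_results = pd.DataFrame(
    [(author, np.mean([doc['score'] for doc in json_result[i]]))
     for i, author in enumerate(authors)],
    columns=['Author', 'Sentiment'])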
Results:
DF_results.sort_values(by='Sentiment', ascending=False)
It's nice to know that I am the most positive person among us 🙂