Import packages

In [1]:
import glob
import http.client
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from itertools import zip_longest
from IPython.core.display import display, HTML
# widen the notebook cells
display(HTML("<style>.container { width:90% !important; }</style>"))

The folder with the exported Telegram chat (use the Telegram Desktop Windows app for the export). [Screenshot: the export folder containing the generated HTML files.]

In [2]:
filepath = "C:\\temp\\telegram_chat\\*.html"
In [3]:
# DataFrame to store messages
DF = pd.DataFrame(columns=['Author', 'text'])
# iterate over the exported files and add all messages to the DataFrame
len_all = 0
for file in glob.glob(filepath):
    with open(file, "r", encoding="utf8") as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    author = "nobody"
    rows = soup.find_all(attrs={"class": ['from_name', 'text']})
    for index, row in enumerate(rows):
        # a 'from_name' element starts a new author's block of messages
        if 'from_name' in str(row):
            author = row.contents[0].replace('\n', '')
        DF.loc[index + len_all, 'Author'] = author
        DF.loc[index + len_all, 'text'] = row.get_text().replace('\n', ' ').replace('\xa0', ' ')
    len_all = len_all + len(rows)
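
For context, here is a minimal sketch of the markup being parsed. The from_name and text class names come from the code above; the surrounding structure is a simplified assumption, since the exact markup of a Telegram Desktop export can vary between versions.

sample = '''
<div class="message">
  <div class="from_name">Eldar</div>
  <div class="text">Hello everyone!</div>
</div>'''
for row in BeautifulSoup(sample, 'html.parser').find_all(attrs={"class": ['from_name', 'text']}):
    print(row.get('class'), row.get_text(strip=True))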
In [4]:
# Data cleaning: stripping leading and trailing spaces, removing empty and forwarded messages
DF["Author"] = DF["Author"].str.strip()
DF["text"] = DF["text"].str.strip()
DF = DF[DF["Author"] != DF["text"]]
# keep only authors with more than 100 messages
DF = DF.groupby("Author").filter(lambda x: len(x) > 100)
# authors list
authors = np.array(DF["Author"].drop_duplicates())
In [5]:
DF.groupby("Author").count().sort_values(by="text", ascending=False)
Out[5]:
         text
Author
Author1   340
Eldar     233
Author2   161
Author3   134
Author4   107

For privacy reasons, I changed the names of all chat participants except myself in the output.

Azure Cognitive Services Text Analytics configuration.
Replace accessKey with your own key (and url, if your resource is in a different region).
More details are in the Azure Text Analytics API documentation.

In [6]:
accessKey = 'youraccesskeyhere'
url = 'westeurope.api.cognitive.microsoft.com'
path = '/text/analytics/v2.0/Sentiment'

def TextAnalytics(documents):
    # POST the documents to the Sentiment endpoint and return the raw JSON response
    headers = {'Ocp-Apim-Subscription-Key': accessKey}
    conn = http.client.HTTPSConnection(url)
    body = json.dumps(documents)
    conn.request("POST", path, body, headers)
    response = conn.getresponse()
    result = response.read()
    conn.close()
    return result
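
A quick sanity check helps before processing the whole chat. This is a minimal sketch (the test text and the response shape shown in the comment are illustrative, and it assumes a valid accessKey); the v2.0 Sentiment endpoint scores each document between 0 (negative) and 1 (positive).

test = {'documents': [{'language': 'ru', 'id': '1', 'text': 'Отличный день!'}]}  # "Great day!"
print(TextAnalytics(test))
# expected shape: b'{"documents":[{"id":"1","score":0.9...}],"errors":[]}'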

A function that turns each of an author's messages into a document; we will send these documents for sentiment analysis.
Change the language if needed.

In [7]:
def getdocs(author):
    filtered = DF[DF["Author"] == author]
    documents = {'documents': []}
    for count, text in enumerate(filtered["text"], start=1):
        # set the language; in my case it's Russian
        documents['documents'].append({"language": "ru", "id": str(count), "text": text})
    return documents
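
As a quick check, you can build the payload for one author and peek at the first document (output illustrative):

docs = getdocs(authors[0])
print(len(docs['documents']), docs['documents'][0])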

The API limits a single request to 1,000 documents.
We will split our messages into batches of 1,000 documents with this function.
This is only needed when at least one author has more than 1,000 messages.

In [8]:
def grouper(iterable, n, fillvalue=None):
    # itertools recipe: collect data into fixed-length chunks,
    # padding the last chunk with fillvalue
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
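
For example, seven items in batches of three (the last batch is padded with None, which we filter out later):

list(grouper([1, 2, 3, 4, 5, 6, 7], 3))
# [(1, 2, 3), (4, 5, 6), (7, None, None)]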

Sending the documents for sentiment analysis: the outer loop iterates over authors, the inner loop over document batches.

In [9]:
json_result = []
for author in authors:
    print(author)
    author_results = []
    docs = getdocs(author)
    for docindex, batch in enumerate(grouper(docs['documents'], 1000)):
        print("batch number: ", docindex)
        # drop the None padding that grouper adds to the last batch
        batch_docs = {'documents': [doc for doc in batch if doc is not None]}
        result = TextAnalytics(batch_docs)
        author_results = author_results + json.loads(result)['documents']
    json_result.append(author_results)
Author1
batch number:  0
Eldar
batch number:  0
Author2
batch number:  0
Author3
batch number:  0
Author4
batch number:  0

Results DataFrame: calculating the average sentiment score for each author.

In [10]:
DF_results = pd.DataFrame(columns=['Author', 'Sentiment'])

for authindex, author in enumerate(authors):
    # average the per-message scores for this author
    sentiment = 0
    for doc in json_result[authindex]:
        sentiment = sentiment + doc['score']
    sentiment = sentiment / len(json_result[authindex])

    DF_results.loc[authindex, 'Author'] = author
    DF_results.loc[authindex, 'Sentiment'] = sentiment
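
The same averaging can be written more compactly; a sketch equivalent to the loop above (it relies on json_result being aligned with authors, as built earlier):

DF_results = pd.DataFrame({
    'Author': authors,
    'Sentiment': [np.mean([doc['score'] for doc in res]) for res in json_result]
})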

Results:

In [11]:
DF_results.sort_values(by='Sentiment', ascending=False)
Out[11]:
    Author  Sentiment
4    Eldar   0.5553
3  Author3   0.547984
2  Author4   0.525982
0  Author2   0.523
1  Author1   0.517575
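
As an optional final step, the per-author averages are easy to visualize. A minimal sketch using matplotlib (assumed to be installed; it is not among the imports above):

import matplotlib.pyplot as plt

# the Sentiment column is object-typed after the .loc assignments, so cast it first
DF_results['Sentiment'] = DF_results['Sentiment'].astype(float)
DF_results.sort_values(by='Sentiment').plot.barh(x='Author', y='Sentiment', legend=False)
plt.xlabel('Average sentiment score')
plt.tight_layout()
plt.show()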

It's nice to know that I am the most positive person among us 🙂