Friday, November 29, 2019

MSE vs Cross Entropy

MSE arises when you assume the error follows a Normal distribution, and cross-entropy arises when you assume a Binomial distribution.

Monday, November 11, 2019

Disaster detection

https://www.pyimagesearch.com/2019/11/11/detecting-natural-disasters-with-keras-and-deep-learning/?utm_source=facebook&utm_medium=ad-11-11-2019&utm_campaign=11+November+2019+BP+-+Traffic&utm_content=Default+name+-+Traffic&fbid_campaign=6128556144046&fbid_adset=6128556297446&utm_adset=11+November+2019+BP+-+Email+List+-+Worldwide+-+18%2B&fbid_ad=6128556297646

Tuesday, July 9, 2019

Python: receive text and predict using a trained model

#!/usr/bin/env python
from keras.models import load_model
from keras.layers.core import Reshape, Flatten
from keras.callbacks import ModelCheckpoint
#from data_helpers import load_data
from keras.optimizers import Adam
from keras.models import Model
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from keras.layers.merge import Concatenate
from sklearn.model_selection import train_test_split
import numpy as np
import re
import sys
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # (pattern, replacement) pairs applied in order: strip disallowed
    # characters, split clitics ('s, 've, n't, ...), space out punctuation,
    # then collapse runs of whitespace.
    rules = [
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),
    ]
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
def pad_sentences(sentences, padding_word="<PAD/>", sequence_length=85):
    """
    Pad every tokenized sentence to `sequence_length` tokens.

    Args:
        sentences: list of token lists.
        padding_word: filler token appended to short sentences.
        sequence_length: target length. Defaults to 85, the model's fixed
            input width (previously hard-coded; now a parameter so the
            function generalizes to other models).

    Returns:
        A new list of sentences, each padded to `sequence_length` tokens.
        Sentences already longer than `sequence_length` are returned
        unchanged (not truncated), matching the original behavior.

    Note: the original docstring claimed padding to the longest sentence and
    computed that max, but never used it; the unused computation is removed
    (it also raised ValueError on an empty input list).
    """
    padded_sentences = []
    for sentence in sentences:
        num_padding = max(sequence_length - len(sentence), 0)
        padded_sentences.append(sentence + [padding_word] * num_padding)
    return padded_sentences

# --- Classify one command-line text string with a saved Keras CNN model. ---
x = sys.argv[1]  # the raw text to classify

# Clean and tokenize, then pad to the model's fixed input length (85 tokens).
x_text = [clean_str(x)]
x_text = [s.split(" ") for s in x_text]
sentences_padded = pad_sentences(x_text)

# The vocabulary dict (token -> integer index) was saved with np.save,
# so .item() recovers the dict from the 0-d object array.
vocabulary = np.load('data123-vocab-servertest2.npy').item()

# NOTE(review): a token absent from the vocabulary raises KeyError here —
# there is no <UNK> fallback; confirm inputs are covered by the training vocab.
x2 = np.array([[vocabulary[word2] for word2 in word] for word in sentences_padded])

# FIX: removed a stray `a` statement here that raised NameError at runtime.
model = load_model('cnn2D-data123-multi-servertest2.hdf5')
y_pred = model.predict(x2)
cc = ['Hate', 'Offensive', 'Neutral']

# Print "<class> <probability>%" for each of the three classes.
for xx in y_pred:
    count = 0
    for yy in xx:
        print(cc[count] + " " + str(format(yy * 100, '.2f')) + "%")
        count = count + 1

word2vec test

# FIX: "aimport" (stray leading 'a') was a SyntaxError.
import gensim.models.keyedvectors as word2vec
import numpy as np

# Pre-trained Google News word2vec embeddings (300-d, binary format).
model = word2vec.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
def get_score(tokens, ground_truth):
    """For each token, return its highest word2vec similarity to any
    ground-truth token (one score per token, in order)."""
    best_scores = []
    for token in tokens:
        # Similarity of this token against every ground-truth token;
        # keep only the maximum. Uses the module-level word2vec `model`.
        similarities = np.array([model.similarity(token, gt) for gt in ground_truth])
        best_scores.append(np.max(similarities, axis=0))
    return best_scores
def get_softP(pred, tokens, tokens_scores):
    """
    Soft precision: sum of similarity scores of the tokens that appear in
    `pred`, divided by (number of matched tokens + 1).

    FIX: the loop read the global `token_scores` instead of the
    `tokens_scores` parameter, so the function only worked by accident when a
    matching global happened to exist.

    NOTE(review): the divisor starts at 1, so it is matches + 1 — kept as-is
    to preserve the original scoring; confirm whether that offset is intended.
    """
    count = 1
    total = 0
    for tk, tk_score in zip(tokens, tokens_scores):
        if tk in pred:
            total = total + tk_score
            count = count + 1
            print(tk, tk_score)
    return total / count

def get_softR(pred, ground_truth, tokens, tokens_scores):
    """
    Soft recall: sum of similarity scores of tokens present in both `pred`
    and `ground_truth`, divided by the number of ground-truth tokens.

    FIX: the loop read the global `token_scores` instead of the
    `tokens_scores` parameter, so the function only worked by accident when a
    matching global happened to exist.
    """
    total = 0
    for tk, tk_score in zip(tokens, tokens_scores):
        # Nested membership tests collapsed into one condition (same logic).
        if tk in pred and tk in ground_truth:
            total = total + tk_score
            print(tk, tk_score)
    return total / len(ground_truth)

# Smoke test for the soft precision/recall helpers above.
# NOTE(review): 'Departmet' is a typo kept as-is — it is live data fed to the
# word2vec model, and changing it would change the similarity scores.
ground_truth=['Finland','University','UEF','Joensuu']
tokens = ['Finland','Departmet','School','Computing','University', 'UEF', 'Science', 'Park', 'Joensuu']
pred=['Finland','School','Park','Joensuu','Computing']

# get_score needs the word2vec model loaded above; the soft P/R return values
# are discarded here — only their print() side effects are visible.
token_scores= get_score(tokens,ground_truth)
get_softP(pred,tokens,token_scores)
get_softR(pred,ground_truth,tokens,token_scores)

Getting a byte image (byte code) for malware detection

from math import log
import numpy as np

def byte_make_image(byte_code):
    """
    Convert malware ``.bytes`` dump lines into a 2-D grayscale image array.

    Each input line is expected to be "<address> b0 b1 ... b15" — 17
    whitespace-separated fields; lines with any other field count are
    skipped. Unknown bytes ('??') become 0. The flat byte stream is reshaped
    to (a, b) where b is the next power of two above sqrt(total bytes), and
    any trailing bytes that do not fill a full row are discarded.

    Returns:
        np.ndarray of shape (a, b) with integer pixel values 0-255.

    Raises:
        ValueError: if the parsed rows do not have exactly 16 byte columns.
    """
    img_array = []
    for row in byte_code:
        fields = row.split()
        if len(fields) != 17:  # address + 16 byte columns
            continue
        img_array.append([int(tok, 16) if tok != '??' else 0 for tok in fields[1:]])
    img_array = np.array(img_array)
    if img_array.shape[1] != 16:
        # FIX: was `assert(False)` — assertions vanish under `python -O`.
        raise ValueError("expected 16 byte columns per parsed row")
    b = int((img_array.shape[0] * 16) ** 0.5)
    b = 2 ** (int(log(b) / log(2)) + 1)  # round the width up to a power of two
    a = img_array.shape[0] * 16 // b     # resulting image height
    # FIX: `a * b / 16` is a float in Python 3 and raised TypeError when used
    # as a slice bound; floor division keeps it an int.
    img_array = img_array[:a * b // 16, :]
    img_array = np.reshape(img_array, (a, b))
    return img_array


# NOTE(review): `byte_code` is not defined in this snippet — presumably the
# lines of a .bytes dump file read elsewhere; confirm before running.
img = byte_make_image(byte_code)

Hate speech detection PHP page

<!DOCTYPE html>
<html>
<title>Hate Speech Detection</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css">
<link rel="stylesheet" href="https://www.w3schools.com/lib/w3-theme-blue-grey.css">
<link rel='stylesheet' href='https://fonts.googleapis.com/css?family=Open+Sans'>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<style>
html,body,h1,h2,h3,h4,h5 {font-family: "Open Sans", sans-serif}
textarea {
  width: 100%;
 
}
</style>
<body class="w3-theme-l5">

<!-- Navbar -->
<div class="w3-top">
 <div class="w3-bar w3-theme-d2 w3-left-align w3-large">
  <a class="w3-bar-item w3-button w3-hide-medium w3-hide-large w3-right w3-padding-large w3-hover-white w3-large w3-theme-d2" href="javascript:void(0);" onclick="openNav()"><i class="fa fa-bars"></i></a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large w3-theme-d4"><i class="fa fa-home w3-margin-right"></i>Hate Speech Detection</a>
 
  <a href="#" class="w3-bar-item w3-button w3-hide-small w3-right w3-padding-large w3-hover-white" title="My Account">
    <img src="mypic.jpg" class="w3-circle" style="height:30px;width:30px" alt="Avatar">
  </a>
 </div>
</div>

<!-- Navbar on small screens -->
<div id="navDemo" class="w3-bar-block w3-theme-d2 w3-hide w3-hide-large w3-hide-medium w3-large">
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Link 1</a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Link 2</a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Link 3</a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Muhammad U. S. Khan</a>
</div>

<!-- Page Container -->
<div class="w3-container w3-content" style="max-width:1400px;margin-top:80px">   
  <!-- The Grid -->
  <div class="w3-row">
    <!-- Left Column -->
    <div class="w3-col m3">
      <!-- Profile -->
      <div class="w3-card w3-round w3-white">
        <div class="w3-container">
         <h4 class="w3-center">Muhammad U. S. Khan</h4>
         <p class="w3-center"><img src="mypic.jpg" class="w3-circle" style="height:106px;width:106px" alt="Avatar"></p>
         <hr>
         <p><i class="fa fa-pencil fa-fw w3-margin-right w3-text-theme"></i> Postdoc Researcher, UEF</p>
         <p><i class="fa fa-home fa-fw w3-margin-right w3-text-theme"></i> Joensuu, Finland</p>
         <!--<p><i class="fa fa-birthday-cake fa-fw w3-margin-right w3-text-theme"></i> </p>-->
        </div>
      </div>
      <br>
     
      <!-- Accordion -->
      <div class="w3-card w3-round">
        <div class="w3-white">
          <button onclick="myFunction('Demo1')" class="w3-button w3-block w3-theme-l1 w3-left-align"><i class="fa fa-circle-o-notch fa-fw w3-margin-right"></i> My Groups</button>
          <div id="Demo1" class="w3-hide w3-container">
            <p>Machine learning group</p>
          </div>
          <button onclick="myFunction('Demo2')" class="w3-button w3-block w3-theme-l1 w3-left-align"><i class="fa fa-calendar-check-o fa-fw w3-margin-right"></i> My Events</button>
          <div id="Demo2" class="w3-hide w3-container">
            <p>Something big in the future</p>
          </div>
         
        </div>     
      </div>
      <br>
     
      <!-- Interests -->
      <div class="w3-card w3-round w3-white w3-hide-small">
        <div class="w3-container">
          <p>Interests</p>
          <p>
            <span class="w3-tag w3-small w3-theme-d5">Machine learning</span>
            <span class="w3-tag w3-small w3-theme-d4">Artificial Intelligence</span>
            <span class="w3-tag w3-small w3-theme-d3">Big data</span>
            <span class="w3-tag w3-small w3-theme-d2">Social media analysis</span>
          </p>
        </div>
      </div>
      <br>
     
      <!-- Alert Box -->
 
    <!-- End Left Column -->
    </div>
   
    <!-- Middle Column -->
    <div class="w3-col m7">
   
      <div class="w3-row-padding">
        <div class="w3-col m12">
          <div class="w3-card w3-round w3-white">
            <div class="w3-container w3-padding">
              <h6 class="w3-opacity">Hate Speech Detection</h6>
  <form action="index.php">
              <textarea class="w3-border w3-padding" rows="4" cols="50" name="nString" >Enter Text</textarea>
             
  <input type="submit" class="w3-button w3-theme" value="Check hate probability">
  </form>
            </div>
          </div>
        </div>
      </div>
     
      <div class="w3-container w3-card w3-white w3-round w3-margin"><br>
       
        <span class="w3-right w3-opacity">1 min</span>
        <h4>Result</h4><br>
        <hr class="w3-clear">
        <p>
<?php
// Run the hate-speech classifier on the submitted text and print each line
// of the Python script's output as a table row.
if (isset($_GET['nString'])) {
    $str = $_GET["nString"];
    $retval = NULL;
    $output = NULL;

    // SECURITY FIX: the raw GET parameter was interpolated directly into the
    // shell command (command injection). escapeshellarg() quotes it safely.
    $arg = escapeshellarg($str);
    exec("python3 /home/tko/usman/web-docs/hate/cnn2d-model2-dataset1-loadvocab-2.py $arg 2>&1", $output, $retval);

    print '<table align="center">';
    $length = count($output);
    // The loop starts at index 1, skipping the first output line —
    // presumably a header/echo line from the Python script; confirm.
    for ($i = 1; $i < $length; $i++) {
        print '<tr><td>';
        // FIX: escape the script output before embedding it in HTML (the
        // output echoes user-derived text, so printing it raw was XSS-prone).
        print htmlspecialchars($output[$i]);
        print "<br>";
        print '</td></tr>';
    }
    print '</table >';
}
else {
    print "";  // FIX: the missing semicolon here was a PHP parse error
}
?>

</p>
          <div class="w3-row-padding" style="margin:0 -16px">
           
        </div>
       
      </div>
     
     
     
    <!-- End Middle Column -->
    </div>
   
    <!-- Right Column -->
    <div class="w3-col m2">
      <div class="w3-card w3-round w3-white ">
        <div class="w3-container">
          <p>Tool is based on a neural network model</p>
          <p><strong>that finds probabilities of</strong></p>
          <p>hate, offensive, and neutrality of any given english language tweet</p>
          <!--<p><button class="w3-button w3-block w3-theme-l4">Info</button></p>-->
        </div>
      </div>
      <br>
     
     
     
     
      <br>
     
      <!--<div class="w3-card w3-round w3-white w3-padding-32 w3-center">
        <p><i class="fa fa-bug w3-xxlarge"></i></p>
      </div>-->
     
    <!-- End Right Column -->
    </div>
   
  <!-- End Grid -->
  </div>
 
<!-- End Page Container -->
</div>
<br>

<!-- Footer -->
<footer class="w3-container w3-theme-d3 w3-padding-16">

</footer>

<footer class="w3-container w3-theme-d5">
 
</footer>

<script>
// Accordion
function myFunction(id) {
    // Show/hide the accordion panel with the given id, and tint its header
    // button (the previous sibling) while the panel is visible.
    var panel = document.getElementById(id);
    var header = panel.previousElementSibling;
    if (panel.className.indexOf("w3-show") === -1) {
        panel.className += " w3-show";
        header.className += " w3-theme-d1";
    } else {
        panel.className = panel.className.replace("w3-show", "");
        header.className = header.className.replace(" w3-theme-d1", "");
    }
}

// Used to toggle the menu on smaller screens when clicking on the menu button
function openNav() {
    // Toggle the "w3-show" class on the small-screen navigation menu.
    var menu = document.getElementById("navDemo");
    var isOpen = menu.className.indexOf("w3-show") !== -1;
    if (isOpen) {
        menu.className = menu.className.replace(" w3-show", "");
    } else {
        menu.className += " w3-show";
    }
}
</script>

</body>
</html>

word2vec server

import gensim.models.keyedvectors as word2vec
import numpy as np
import socket
import sys


model = word2vec.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)


# Create a TCP/IP socket and serve similarity queries of the form
# "word1,word2"; the reply is the word2vec similarity, or -1 for unknown words.
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_address = ('localhost', 10004)
# FIX: print(sys.stderr, ...) printed the repr of the stderr object to stdout.
# The Python 2 idiom `print >> sys.stderr, ...` is `file=sys.stderr` in Python 3.
print('starting up on %s port %s' % server_address, file=sys.stderr)
sock.bind(server_address)
sock.listen(1)

while True:
    # Wait for a connection
    print('waiting for a connection', file=sys.stderr)
    connection, client_address = sock.accept()
    try:
        print('connection from', client_address, file=sys.stderr)

        # Receive queries in small chunks and answer each one
        while True:
            data = connection.recv(1001)
            print('received ', data, file=sys.stderr)
            if data:
                print('sending data back to the client', file=sys.stderr)
                data = data.decode("utf-8")
                words = data.split(',')
                try:
                    result = model.similarity(words[0], words[1])
                except KeyError:
                    # Either word is missing from the embedding vocabulary.
                    result = -1
                result = str(result) + "\n"
                print(str(result))
                connection.sendall(bytes(result, "utf-8"))
            else:
                print('no more data from', client_address, file=sys.stderr)
                break

    finally:
        # Clean up the connection
        connection.close()

Plots with rotated tick-label text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#reading the data

# Bar chart of per-sensor recall values with x-axis labels rotated 45 degrees.
recall_df = pd.read_csv('recall.csv')
print(recall_df)

bar_positions = np.arange(len(recall_df['Sensors']))
plt.bar(bar_positions, recall_df['Value'], align='center', alpha=1, color='black')
plt.xticks(bar_positions, recall_df['Sensors'], rotation=45)
plt.ylim(.85, 1.0)
plt.show()

Frequency percentage calculation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#reading the data

# Empirical CDF: sort the values, then assign each row a cumulative frequency
# that increases by 1/N per row.
test_data = pd.read_csv('testdata.csv')

sorted_data = test_data.sort_values(by='Data', ascending=True)

actual_frequencey_percentage = []
prev = 0
# FIX: DataFrame.count() returns a per-column Series, which made `cd` a
# Series (and every appended value a Series). len() gives the scalar row count.
totaldata = len(sorted_data)

cd = 1 / totaldata
print(cd)  # FIX: Python 2 `print cd` was a SyntaxError on Python 3
for row in sorted_data.iterrows():
    print(row[1][0])  # FIX: same Python 2 print-statement porting fix
    new = cd + prev
    prev = new
    actual_frequencey_percentage.append(new)

print(actual_frequencey_percentage)

plt.plot(sorted_data, actual_frequencey_percentage)
plt.show()

alpha evaluation for multilabel

from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#reading the data



def alpha_evaluation(predicted_df, test_df, alpha=0, beta=1, gamma=1):
    """
    Print per-class soft precision and recall for multi-label predictions.

    Args:
        predicted_df: DataFrame of per-class prediction scores; thresholded
            at 0.5 into 0/1 labels.
        test_df: DataFrame of 0/1 ground-truth labels, same shape.
        alpha, beta, gamma: scoring parameters. Per row the score is
            score = (1 - (beta*Mx + gamma*Fx) / Cx) ** alpha
            where Mx = missed labels, Fx = false positives, Cx = size of the
            union of predicted and true labels; the score is 0 when no label
            is predicted correctly (Sx == 0).

    Side effects: prints (Mx, Fx, Sx, Cx) per row and a per-class
    precision/recall table; returns None.
    """
    # FIX: work on copies — the original thresholded predicted_df in place,
    # silently mutating the caller's DataFrame.
    y_pred = predicted_df.copy()
    y_test = test_df.copy()
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0

    n_rows = len(y_pred.index)
    n_cols = len(y_pred.columns)
    pred_class = [0] * n_cols    # times each class was predicted
    test_class = [0] * n_cols    # times each class appears in ground truth
    pscore_class = [0] * n_cols  # row scores summed over predicted rows
    rscore_class = [0] * n_cols  # row scores summed over ground-truth rows
    score_row = [0] * n_rows     # per-row alpha score

    for i in range(n_rows):
        Mx = 0   # misclassified (missed) labels in this row
        Fx = 0   # falsely predicted labels in this row
        Sx = 0   # correctly predicted labels in this row
        Cx = 0   # union of predicted and true labels in this row
        for j in range(n_cols):
            if y_pred.values[i, j] == 1:
                pred_class[j] += 1
            if y_test.values[i, j] == 1:
                test_class[j] += 1
            if y_pred.values[i, j] == 0 and y_test.values[i, j] == 1:
                Mx += 1
            elif y_pred.values[i, j] == 1 and y_test.values[i, j] == 0:
                Fx += 1
            elif y_pred.values[i, j] == 1 and y_test.values[i, j] == 1:
                Sx += 1
            if y_pred.values[i, j] == 1 or y_test.values[i, j] == 1:
                Cx += 1
        print(Mx, Fx, Sx, Cx)

        if Sx == 0:
            score = 0
        else:
            temp = 1 - (((beta * Mx) + (gamma * Fx)) / Cx)
            score = pow(temp, alpha)
        score_row[i] += score

    # Accumulate each row's score into every class it predicts / contains.
    for i in range(n_rows):
        for j in range(n_cols):
            if y_pred.values[i, j] == 1:
                pscore_class[j] += score_row[i]
            if y_test.values[i, j] == 1:
                rscore_class[j] += score_row[i]

    print(" class,  precision, recall")
    for j in range(n_cols):
        # FIX: guard against ZeroDivisionError when a class is never
        # predicted or never present in the ground truth.
        p = pscore_class[j] / pred_class[j] if pred_class[j] else 0.0
        r = rscore_class[j] / test_class[j] if test_class[j] else 0.0
        print(j, p, r)
   

# Load predicted scores and ground-truth labels (CSV files with no header row)
# and print the alpha-evaluation report.
y_pred1 = pd.read_csv('y_pred1.csv',header=None)
y_test1 = pd.read_csv('y_test.csv',header=None)

# beta = 1/4: true division is guaranteed by the __future__ import at the top.
alpha_evaluation(y_pred1,y_test1,alpha=1,beta = 1/4)

distance between vectors

# Word/code vectors, one row per item. NOTE(review): row order is assumed to
# match the diag2int indices used below — confirm.
v2 = np.loadtxt("myvector2.txt")
def euclidean_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between two numpy vectors."""
    squared_diff = (vec1 - vec2) ** 2
    return np.sqrt(squared_diff.sum())

def find_closest(word_index, vectors):
    """
    Return the index of the vector nearest (L2 distance) to
    vectors[word_index], skipping any vector exactly equal to the query
    (including the query itself). Returns -1 if no other vector qualifies.
    """
    query_vector = vectors[word_index]
    # FIX: the old sentinel 100000 silently returned -1 whenever every
    # distance exceeded it; infinity is always a safe upper bound.
    min_dist = float('inf')
    min_index = -1
    for index, vector in enumerate(vectors):
        if np.array_equal(vector, query_vector):
            continue
        # FIX: the distance was computed twice per candidate; compute once.
        dist = np.sqrt(np.sum((vector - query_vector) ** 2))
        if dist < min_dist:
            min_dist = dist
            min_index = index
    return min_index

# NOTE(review): `int2diag` / `diag2int` (index <-> code mappings) are defined
# elsewhere; the keys look like ICD-10 diagnosis codes — confirm before running.
print(int2diag[find_closest(diag2int['I63.4'],v2)])
print(int2diag[find_closest(diag2int['F71.1'],v2)])
print(int2diag[find_closest(diag2int['R00.2'],v2)])

# Direct distance between two code vectors; the value is computed and discarded.
euclidean_dist(v2[diag2int['I63.4']],v2[diag2int['S82.9']])