Friday, November 29, 2019

MSE vs Cross Entropy

MSE arises when you assume the error follows a Normal distribution, and cross-entropy arises when you assume a Binomial distribution.

Monday, November 11, 2019

Disaster detection

https://www.pyimagesearch.com/2019/11/11/detecting-natural-disasters-with-keras-and-deep-learning/?utm_source=facebook&utm_medium=ad-11-11-2019&utm_campaign=11+November+2019+BP+-+Traffic&utm_content=Default+name+-+Traffic&fbid_campaign=6128556144046&fbid_adset=6128556297446&utm_adset=11+November+2019+BP+-+Email+List+-+Worldwide+-+18%2B&fbid_ad=6128556297646

Tuesday, July 9, 2019

Python: receive text and predict using a trained model

#!/usr/bin/env python
from keras.models import load_model
from keras.layers.core import Reshape, Flatten
from keras.callbacks import ModelCheckpoint
#from data_helpers import load_data
from keras.optimizers import Adam
from keras.models import Model
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from keras.layers.merge import Concatenate
from sklearn.model_selection import train_test_split
import numpy as np
import re
import sys
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # (pattern, replacement) pairs applied in order: strip disallowed
    # characters, split clitics ('s, 've, n't, ...), space out punctuation,
    # then collapse runs of whitespace.
    rules = [
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),
    ]
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
def pad_sentences(sentences, padding_word="<PAD/>", sequence_length=85):
    """
    Pad every tokenized sentence to `sequence_length` tokens.

    Args:
        sentences: list of token lists.
        padding_word: filler token appended to short sentences.
        sequence_length: target length. Defaults to 85, the model's fixed
            input width (previously hard-coded; now a parameter so the
            function generalizes to other models).

    Returns:
        A new list of sentences, each padded to `sequence_length` tokens.
        Sentences already longer than `sequence_length` are returned
        unchanged (not truncated), matching the original behavior.

    Note: the original docstring claimed padding to the longest sentence and
    computed that max, but never used it; the unused computation is removed
    (it also raised ValueError on an empty input list).
    """
    padded_sentences = []
    for sentence in sentences:
        num_padding = max(sequence_length - len(sentence), 0)
        padded_sentences.append(sentence + [padding_word] * num_padding)
    return padded_sentences

# --- Classify one command-line text string with a saved Keras CNN model. ---
x = sys.argv[1]  # the raw text to classify

# Clean and tokenize, then pad to the model's fixed input length (85 tokens).
x_text = [clean_str(x)]
x_text = [s.split(" ") for s in x_text]
sentences_padded = pad_sentences(x_text)

# The vocabulary dict (token -> integer index) was saved with np.save,
# so .item() recovers the dict from the 0-d object array.
vocabulary = np.load('data123-vocab-servertest2.npy').item()

# NOTE(review): a token absent from the vocabulary raises KeyError here —
# there is no <UNK> fallback; confirm inputs are covered by the training vocab.
x2 = np.array([[vocabulary[word2] for word2 in word] for word in sentences_padded])

# FIX: removed a stray `a` statement here that raised NameError at runtime.
model = load_model('cnn2D-data123-multi-servertest2.hdf5')
y_pred = model.predict(x2)
cc = ['Hate', 'Offensive', 'Neutral']

# Print "<class> <probability>%" for each of the three classes.
for xx in y_pred:
    count = 0
    for yy in xx:
        print(cc[count] + " " + str(format(yy * 100, '.2f')) + "%")
        count = count + 1

word2vec test

# FIX: "aimport" (stray leading 'a') was a SyntaxError.
import gensim.models.keyedvectors as word2vec
import numpy as np

# Pre-trained Google News word2vec embeddings (300-d, binary format).
model = word2vec.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
def get_score(tokens, ground_truth):
    """For each token, return its highest word2vec similarity to any
    ground-truth token (one score per token, in order)."""
    best_scores = []
    for token in tokens:
        # Similarity of this token against every ground-truth token;
        # keep only the maximum. Uses the module-level word2vec `model`.
        similarities = np.array([model.similarity(token, gt) for gt in ground_truth])
        best_scores.append(np.max(similarities, axis=0))
    return best_scores
def get_softP(pred, tokens, tokens_scores):
    """
    Soft precision: sum of similarity scores of the tokens that appear in
    `pred`, divided by (number of matched tokens + 1).

    FIX: the loop read the global `token_scores` instead of the
    `tokens_scores` parameter, so the function only worked by accident when a
    matching global happened to exist.

    NOTE(review): the divisor starts at 1, so it is matches + 1 — kept as-is
    to preserve the original scoring; confirm whether that offset is intended.
    """
    count = 1
    total = 0
    for tk, tk_score in zip(tokens, tokens_scores):
        if tk in pred:
            total = total + tk_score
            count = count + 1
            print(tk, tk_score)
    return total / count

def get_softR(pred, ground_truth, tokens, tokens_scores):
    """
    Soft recall: sum of similarity scores of tokens present in both `pred`
    and `ground_truth`, divided by the number of ground-truth tokens.

    FIX: the loop read the global `token_scores` instead of the
    `tokens_scores` parameter, so the function only worked by accident when a
    matching global happened to exist.
    """
    total = 0
    for tk, tk_score in zip(tokens, tokens_scores):
        # Nested membership tests collapsed into one condition (same logic).
        if tk in pred and tk in ground_truth:
            total = total + tk_score
            print(tk, tk_score)
    return total / len(ground_truth)

# Smoke test for the soft precision/recall helpers above.
# NOTE(review): 'Departmet' is a typo kept as-is — it is live data fed to the
# word2vec model, and changing it would change the similarity scores.
ground_truth=['Finland','University','UEF','Joensuu']
tokens = ['Finland','Departmet','School','Computing','University', 'UEF', 'Science', 'Park', 'Joensuu']
pred=['Finland','School','Park','Joensuu','Computing']

# get_score needs the word2vec model loaded above; the soft P/R return values
# are discarded here — only their print() side effects are visible.
token_scores= get_score(tokens,ground_truth)
get_softP(pred,tokens,token_scores)
get_softR(pred,ground_truth,tokens,token_scores)

Getting a byte image (byte code) for malware detection

from math import log
import numpy as np

def byte_make_image(byte_code):
    """
    Convert malware ``.bytes`` dump lines into a 2-D grayscale image array.

    Each input line is expected to be "<address> b0 b1 ... b15" — 17
    whitespace-separated fields; lines with any other field count are
    skipped. Unknown bytes ('??') become 0. The flat byte stream is reshaped
    to (a, b) where b is the next power of two above sqrt(total bytes), and
    any trailing bytes that do not fill a full row are discarded.

    Returns:
        np.ndarray of shape (a, b) with integer pixel values 0-255.

    Raises:
        ValueError: if the parsed rows do not have exactly 16 byte columns.
    """
    img_array = []
    for row in byte_code:
        fields = row.split()
        if len(fields) != 17:  # address + 16 byte columns
            continue
        img_array.append([int(tok, 16) if tok != '??' else 0 for tok in fields[1:]])
    img_array = np.array(img_array)
    if img_array.shape[1] != 16:
        # FIX: was `assert(False)` — assertions vanish under `python -O`.
        raise ValueError("expected 16 byte columns per parsed row")
    b = int((img_array.shape[0] * 16) ** 0.5)
    b = 2 ** (int(log(b) / log(2)) + 1)  # round the width up to a power of two
    a = img_array.shape[0] * 16 // b     # resulting image height
    # FIX: `a * b / 16` is a float in Python 3 and raised TypeError when used
    # as a slice bound; floor division keeps it an int.
    img_array = img_array[:a * b // 16, :]
    img_array = np.reshape(img_array, (a, b))
    return img_array


# NOTE(review): `byte_code` is not defined in this snippet — presumably the
# lines of a .bytes dump file read elsewhere; confirm before running.
img = byte_make_image(byte_code)

Hate speech detection PHP page

<!DOCTYPE html>
<html>
<title>Hate Speech Detection</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css">
<link rel="stylesheet" href="https://www.w3schools.com/lib/w3-theme-blue-grey.css">
<link rel='stylesheet' href='https://fonts.googleapis.com/css?family=Open+Sans'>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<style>
html,body,h1,h2,h3,h4,h5 {font-family: "Open Sans", sans-serif}
textarea {
  width: 100%;
 
}
</style>
<body class="w3-theme-l5">

<!-- Navbar -->
<div class="w3-top">
 <div class="w3-bar w3-theme-d2 w3-left-align w3-large">
  <a class="w3-bar-item w3-button w3-hide-medium w3-hide-large w3-right w3-padding-large w3-hover-white w3-large w3-theme-d2" href="javascript:void(0);" onclick="openNav()"><i class="fa fa-bars"></i></a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large w3-theme-d4"><i class="fa fa-home w3-margin-right"></i>Hate Speech Detection</a>
 
  <a href="#" class="w3-bar-item w3-button w3-hide-small w3-right w3-padding-large w3-hover-white" title="My Account">
    <img src="mypic.jpg" class="w3-circle" style="height:30px;width:30px" alt="Avatar">
  </a>
 </div>
</div>

<!-- Navbar on small screens -->
<div id="navDemo" class="w3-bar-block w3-theme-d2 w3-hide w3-hide-large w3-hide-medium w3-large">
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Link 1</a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Link 2</a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Link 3</a>
  <a href="#" class="w3-bar-item w3-button w3-padding-large">Muhammad U. S. Khan</a>
</div>

<!-- Page Container -->
<div class="w3-container w3-content" style="max-width:1400px;margin-top:80px">   
  <!-- The Grid -->
  <div class="w3-row">
    <!-- Left Column -->
    <div class="w3-col m3">
      <!-- Profile -->
      <div class="w3-card w3-round w3-white">
        <div class="w3-container">
         <h4 class="w3-center">Muhammad U. S. Khan</h4>
         <p class="w3-center"><img src="mypic.jpg" class="w3-circle" style="height:106px;width:106px" alt="Avatar"></p>
         <hr>
         <p><i class="fa fa-pencil fa-fw w3-margin-right w3-text-theme"></i> Postdoc Researcher, UEF</p>
         <p><i class="fa fa-home fa-fw w3-margin-right w3-text-theme"></i> Joensuu, Finland</p>
         <!--<p><i class="fa fa-birthday-cake fa-fw w3-margin-right w3-text-theme"></i> </p>-->
        </div>
      </div>
      <br>
     
      <!-- Accordion -->
      <div class="w3-card w3-round">
        <div class="w3-white">
          <button onclick="myFunction('Demo1')" class="w3-button w3-block w3-theme-l1 w3-left-align"><i class="fa fa-circle-o-notch fa-fw w3-margin-right"></i> My Groups</button>
          <div id="Demo1" class="w3-hide w3-container">
            <p>Machine learning group</p>
          </div>
          <button onclick="myFunction('Demo2')" class="w3-button w3-block w3-theme-l1 w3-left-align"><i class="fa fa-calendar-check-o fa-fw w3-margin-right"></i> My Events</button>
          <div id="Demo2" class="w3-hide w3-container">
            <p>Something big in the future</p>
          </div>
         
        </div>     
      </div>
      <br>
     
      <!-- Interests -->
      <div class="w3-card w3-round w3-white w3-hide-small">
        <div class="w3-container">
          <p>Interests</p>
          <p>
            <span class="w3-tag w3-small w3-theme-d5">Machine learning</span>
            <span class="w3-tag w3-small w3-theme-d4">Artificial Intelligence</span>
            <span class="w3-tag w3-small w3-theme-d3">Big data</span>
            <span class="w3-tag w3-small w3-theme-d2">Social media analysis</span>
          </p>
        </div>
      </div>
      <br>
     
      <!-- Alert Box -->
 
    <!-- End Left Column -->
    </div>
   
    <!-- Middle Column -->
    <div class="w3-col m7">
   
      <div class="w3-row-padding">
        <div class="w3-col m12">
          <div class="w3-card w3-round w3-white">
            <div class="w3-container w3-padding">
              <h6 class="w3-opacity">Hate Speech Detection</h6>
  <form action="index.php">
              <textarea class="w3-border w3-padding" rows="4" cols="50" name="nString" >Enter Text</textarea>
             
  <input type="submit" class="w3-button w3-theme" value="Check hate probability">
  </form>
            </div>
          </div>
        </div>
      </div>
     
      <div class="w3-container w3-card w3-white w3-round w3-margin"><br>
       
        <span class="w3-right w3-opacity">1 min</span>
        <h4>Result</h4><br>
        <hr class="w3-clear">
        <p>
<?php
// Run the hate-speech classifier on the submitted text and print each line
// of the Python script's output as a table row.
if (isset($_GET['nString'])) {
    $str = $_GET["nString"];
    $retval = NULL;
    $output = NULL;

    // SECURITY FIX: the raw GET parameter was interpolated directly into the
    // shell command (command injection). escapeshellarg() quotes it safely.
    $arg = escapeshellarg($str);
    exec("python3 /home/tko/usman/web-docs/hate/cnn2d-model2-dataset1-loadvocab-2.py $arg 2>&1", $output, $retval);

    print '<table align="center">';
    $length = count($output);
    // The loop starts at index 1, skipping the first output line —
    // presumably a header/echo line from the Python script; confirm.
    for ($i = 1; $i < $length; $i++) {
        print '<tr><td>';
        // FIX: escape the script output before embedding it in HTML (the
        // output echoes user-derived text, so printing it raw was XSS-prone).
        print htmlspecialchars($output[$i]);
        print "<br>";
        print '</td></tr>';
    }
    print '</table >';
}
else {
    print "";  // FIX: the missing semicolon here was a PHP parse error
}
?>

</p>
          <div class="w3-row-padding" style="margin:0 -16px">
           
        </div>
       
      </div>
     
     
     
    <!-- End Middle Column -->
    </div>
   
    <!-- Right Column -->
    <div class="w3-col m2">
      <div class="w3-card w3-round w3-white ">
        <div class="w3-container">
          <p>Tool is based on a neural network model</p>
          <p><strong>that finds probabilities of</strong></p>
          <p>hate, offensive, and neutrality of any given english language tweet</p>
          <!--<p><button class="w3-button w3-block w3-theme-l4">Info</button></p>-->
        </div>
      </div>
      <br>
     
     
     
     
      <br>
     
      <!--<div class="w3-card w3-round w3-white w3-padding-32 w3-center">
        <p><i class="fa fa-bug w3-xxlarge"></i></p>
      </div>-->
     
    <!-- End Right Column -->
    </div>
   
  <!-- End Grid -->
  </div>
 
<!-- End Page Container -->
</div>
<br>

<!-- Footer -->
<footer class="w3-container w3-theme-d3 w3-padding-16">

</footer>

<footer class="w3-container w3-theme-d5">
 
</footer>

<script>
// Accordion
function myFunction(id) {
    // Show/hide the accordion panel with the given id, and tint its header
    // button (the previous sibling) while the panel is visible.
    var panel = document.getElementById(id);
    var header = panel.previousElementSibling;
    if (panel.className.indexOf("w3-show") === -1) {
        panel.className += " w3-show";
        header.className += " w3-theme-d1";
    } else {
        panel.className = panel.className.replace("w3-show", "");
        header.className = header.className.replace(" w3-theme-d1", "");
    }
}

// Used to toggle the menu on smaller screens when clicking on the menu button
function openNav() {
    // Toggle the "w3-show" class on the small-screen navigation menu.
    var menu = document.getElementById("navDemo");
    var isOpen = menu.className.indexOf("w3-show") !== -1;
    if (isOpen) {
        menu.className = menu.className.replace(" w3-show", "");
    } else {
        menu.className += " w3-show";
    }
}
</script>

</body>
</html>

word2vec server

import gensim.models.keyedvectors as word2vec
import numpy as np
import socket
import sys


model = word2vec.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)


# Create a TCP/IP socket and serve similarity queries of the form
# "word1,word2"; the reply is the word2vec similarity, or -1 for unknown words.
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_address = ('localhost', 10004)
# FIX: print(sys.stderr, ...) printed the repr of the stderr object to stdout.
# The Python 2 idiom `print >> sys.stderr, ...` is `file=sys.stderr` in Python 3.
print('starting up on %s port %s' % server_address, file=sys.stderr)
sock.bind(server_address)
sock.listen(1)

while True:
    # Wait for a connection
    print('waiting for a connection', file=sys.stderr)
    connection, client_address = sock.accept()
    try:
        print('connection from', client_address, file=sys.stderr)

        # Receive queries in small chunks and answer each one
        while True:
            data = connection.recv(1001)
            print('received ', data, file=sys.stderr)
            if data:
                print('sending data back to the client', file=sys.stderr)
                data = data.decode("utf-8")
                words = data.split(',')
                try:
                    result = model.similarity(words[0], words[1])
                except KeyError:
                    # Either word is missing from the embedding vocabulary.
                    result = -1
                result = str(result) + "\n"
                print(str(result))
                connection.sendall(bytes(result, "utf-8"))
            else:
                print('no more data from', client_address, file=sys.stderr)
                break

    finally:
        # Clean up the connection
        connection.close()

Plots with rotated tick-label text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#reading the data

# Bar chart of per-sensor recall values with x-axis labels rotated 45 degrees.
recall_df = pd.read_csv('recall.csv')
print(recall_df)

bar_positions = np.arange(len(recall_df['Sensors']))
plt.bar(bar_positions, recall_df['Value'], align='center', alpha=1, color='black')
plt.xticks(bar_positions, recall_df['Sensors'], rotation=45)
plt.ylim(.85, 1.0)
plt.show()

Frequency percentage calculation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#reading the data

# Empirical CDF: sort the values, then assign each row a cumulative frequency
# that increases by 1/N per row.
test_data = pd.read_csv('testdata.csv')

sorted_data = test_data.sort_values(by='Data', ascending=True)

actual_frequencey_percentage = []
prev = 0
# FIX: DataFrame.count() returns a per-column Series, which made `cd` a
# Series (and every appended value a Series). len() gives the scalar row count.
totaldata = len(sorted_data)

cd = 1 / totaldata
print(cd)  # FIX: Python 2 `print cd` was a SyntaxError on Python 3
for row in sorted_data.iterrows():
    print(row[1][0])  # FIX: same Python 2 print-statement porting fix
    new = cd + prev
    prev = new
    actual_frequencey_percentage.append(new)

print(actual_frequencey_percentage)

plt.plot(sorted_data, actual_frequencey_percentage)
plt.show()

alpha evaluation for multilabel

from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#reading the data



def alpha_evaluation(predicted_df, test_df, alpha=0, beta=1, gamma=1):
    """
    Print per-class soft precision and recall for multi-label predictions.

    Args:
        predicted_df: DataFrame of per-class prediction scores; thresholded
            at 0.5 into 0/1 labels.
        test_df: DataFrame of 0/1 ground-truth labels, same shape.
        alpha, beta, gamma: scoring parameters. Per row the score is
            score = (1 - (beta*Mx + gamma*Fx) / Cx) ** alpha
            where Mx = missed labels, Fx = false positives, Cx = size of the
            union of predicted and true labels; the score is 0 when no label
            is predicted correctly (Sx == 0).

    Side effects: prints (Mx, Fx, Sx, Cx) per row and a per-class
    precision/recall table; returns None.
    """
    # FIX: work on copies — the original thresholded predicted_df in place,
    # silently mutating the caller's DataFrame.
    y_pred = predicted_df.copy()
    y_test = test_df.copy()
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0

    n_rows = len(y_pred.index)
    n_cols = len(y_pred.columns)
    pred_class = [0] * n_cols    # times each class was predicted
    test_class = [0] * n_cols    # times each class appears in ground truth
    pscore_class = [0] * n_cols  # row scores summed over predicted rows
    rscore_class = [0] * n_cols  # row scores summed over ground-truth rows
    score_row = [0] * n_rows     # per-row alpha score

    for i in range(n_rows):
        Mx = 0   # misclassified (missed) labels in this row
        Fx = 0   # falsely predicted labels in this row
        Sx = 0   # correctly predicted labels in this row
        Cx = 0   # union of predicted and true labels in this row
        for j in range(n_cols):
            if y_pred.values[i, j] == 1:
                pred_class[j] += 1
            if y_test.values[i, j] == 1:
                test_class[j] += 1
            if y_pred.values[i, j] == 0 and y_test.values[i, j] == 1:
                Mx += 1
            elif y_pred.values[i, j] == 1 and y_test.values[i, j] == 0:
                Fx += 1
            elif y_pred.values[i, j] == 1 and y_test.values[i, j] == 1:
                Sx += 1
            if y_pred.values[i, j] == 1 or y_test.values[i, j] == 1:
                Cx += 1
        print(Mx, Fx, Sx, Cx)

        if Sx == 0:
            score = 0
        else:
            temp = 1 - (((beta * Mx) + (gamma * Fx)) / Cx)
            score = pow(temp, alpha)
        score_row[i] += score

    # Accumulate each row's score into every class it predicts / contains.
    for i in range(n_rows):
        for j in range(n_cols):
            if y_pred.values[i, j] == 1:
                pscore_class[j] += score_row[i]
            if y_test.values[i, j] == 1:
                rscore_class[j] += score_row[i]

    print(" class,  precision, recall")
    for j in range(n_cols):
        # FIX: guard against ZeroDivisionError when a class is never
        # predicted or never present in the ground truth.
        p = pscore_class[j] / pred_class[j] if pred_class[j] else 0.0
        r = rscore_class[j] / test_class[j] if test_class[j] else 0.0
        print(j, p, r)
   

# Load predicted scores and ground-truth labels (CSV files with no header row)
# and print the alpha-evaluation report.
y_pred1 = pd.read_csv('y_pred1.csv',header=None)
y_test1 = pd.read_csv('y_test.csv',header=None)

# beta = 1/4: true division is guaranteed by the __future__ import at the top.
alpha_evaluation(y_pred1,y_test1,alpha=1,beta = 1/4)

distance between vectors

# Word/code vectors, one row per item. NOTE(review): row order is assumed to
# match the diag2int indices used below — confirm.
v2 = np.loadtxt("myvector2.txt")
def euclidean_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between two numpy vectors."""
    squared_diff = (vec1 - vec2) ** 2
    return np.sqrt(squared_diff.sum())

def find_closest(word_index, vectors):
    """
    Return the index of the vector nearest (L2 distance) to
    vectors[word_index], skipping any vector exactly equal to the query
    (including the query itself). Returns -1 if no other vector qualifies.
    """
    query_vector = vectors[word_index]
    # FIX: the old sentinel 100000 silently returned -1 whenever every
    # distance exceeded it; infinity is always a safe upper bound.
    min_dist = float('inf')
    min_index = -1
    for index, vector in enumerate(vectors):
        if np.array_equal(vector, query_vector):
            continue
        # FIX: the distance was computed twice per candidate; compute once.
        dist = np.sqrt(np.sum((vector - query_vector) ** 2))
        if dist < min_dist:
            min_dist = dist
            min_index = index
    return min_index

# NOTE(review): `int2diag` / `diag2int` (index <-> code mappings) are defined
# elsewhere; the keys look like ICD-10 diagnosis codes — confirm before running.
print(int2diag[find_closest(diag2int['I63.4'],v2)])
print(int2diag[find_closest(diag2int['F71.1'],v2)])
print(int2diag[find_closest(diag2int['R00.2'],v2)])

# Direct distance between two code vectors; the value is computed and discarded.
euclidean_dist(v2[diag2int['I63.4']],v2[diag2int['S82.9']])