# p r 6 
 
 
 
 i m p o r t   p a n d a s   a s   p d     #   N e e d e d   f o r   r e a d i n g   d a t a s e t 
 
 
 
 d a t a   =   p d . r e a d _ c s v ( " S M S S p a m C o l l e c t i o n " ,   s e p = ' \ t ' ,   n a m e s = [ " l a b e l " ,   " m e s s a g e " ] ) 
 
 d a t a [ ' l a b e l _ n u m ' ]   =   d a t a [ ' l a b e l ' ] . m a p ( { ' h a m ' :   0 ,   ' s p a m ' :   1 } ) 
 
 
 
 p r i n t ( " D a t a s e t   l o a d e d   s u c c e s s f u l l y ! " ) 
 
 
 
 p r i n t ( d a t a . h e a d ( ) ) 
 
 
 
 d a t a . s h a p e 
 
 
 
 
 
 i m p o r t   r e     #   F o r   t e x t   c l e a n i n g   ( r e g e x ) 
 
 i m p o r t   n l t k     #   F o r   t o k e n i z a t i o n ,   s t o p w o r d s ,   s t e m m i n g 
 
 f r o m   n l t k . c o r p u s   i m p o r t   s t o p w o r d s 
 
 f r o m   n l t k . s t e m   i m p o r t   P o r t e r S t e m m e r 
 
 f r o m   n l t k . t o k e n i z e   i m p o r t   w o r d _ t o k e n i z e 
 
 
 
 n l t k . d o w n l o a d ( ' p u n k t ' ) 
 
 n l t k . d o w n l o a d ( ' p u n k t _ t a b ' ) 
 
 n l t k . d o w n l o a d ( ' s t o p w o r d s ' ) 
 
 
 
 s t o p _ w o r d s   =   s t o p w o r d s . w o r d s ( ' e n g l i s h ' )     
 
 s t e m m e r   =   P o r t e r S t e m m e r ( )     
 
 d e f   c l e a n _ t e x t _ s i m p l e ( t e x t ) : 
 
         t e x t   =   t e x t . l o w e r ( )     #   l o w e r c a s e 
 
         t e x t   =   r e . s u b ( r " h t t p \ S + | w w w \ S + " ,   " " ,   t e x t )     
 
         t e x t   =   r e . s u b ( r " [ ^ a - z \ s ] " ,   " " ,   t e x t )     
 
         w o r d s   =   w o r d _ t o k e n i z e ( t e x t )   
 
         w o r d s   =   [ s t e m m e r . s t e m ( w )   f o r   w   i n   w o r d s   i f   w   n o t   i n   s t o p _ w o r d s ]     
 
         r e t u r n   "   " . j o i n ( w o r d s ) 
 
 
 
 d a t a [ " c l e a n _ m s g " ]   =   d a t a [ " m e s s a g e " ] . a p p l y ( c l e a n _ t e x t _ s i m p l e ) 
 
 
 
 p r i n t ( d a t a [ [ " m e s s a g e " ,   " c l e a n _ m s g " ] ] . h e a d ( ) ) 
 
 
 
 
 
 f r o m   s k l e a r n . m o d e l _ s e l e c t i o n   i m p o r t   t r a i n _ t e s t _ s p l i t 
 
 
 
 X   =   d a t a [ " c l e a n _ m s g " ] 
 
 y   =   d a t a [ " l a b e l _ n u m " ] 
 
 
 
 X _ t r a i n ,   X _ t e s t ,   y _ t r a i n ,   y _ t e s t   =   t r a i n _ t e s t _ s p l i t ( 
 
         X ,   y ,   t e s t _ s i z e = 0 . 2 ,   r a n d o m _ s t a t e = 4 2 ,   s t r a t i f y = y 
 
 ) 
 
 
 
 
 
 
 
 f r o m   s k l e a r n . f e a t u r e _ e x t r a c t i o n . t e x t   i m p o r t   T f i d f V e c t o r i z e r 
 
 
 
 t f i d f   =   T f i d f V e c t o r i z e r ( m a x _ f e a t u r e s = 3 0 0 0 )   
 
 X _ t r a i n _ t f i d f   =   t f i d f . f i t _ t r a n s f o r m ( X _ t r a i n )     
 
 X _ t e s t _ t f i d f   =   t f i d f . t r a n s f o r m ( X _ t e s t ) 
 
 
 
 
 
 f r o m   s k l e a r n . n a i v e _ b a y e s   i m p o r t   M u l t i n o m i a l N B 
 
 
 
 n b   =   M u l t i n o m i a l N B ( ) 
 
 n b . f i t ( X _ t r a i n _ t f i d f ,   y _ t r a i n ) 
 
 y _ p r e d   =   n b . p r e d i c t ( X _ t e s t _ t f i d f ) 
 
 
 
 
 
 f r o m   s k l e a r n . m e t r i c s   i m p o r t   a c c u r a c y _ s c o r e ,   p r e c i s i o n _ s c o r e ,   r e c a l l _ s c o r e ,   f 1 _ s c o r e ,   c o n f u s i o n _ m a t r i x ,   c l a s s i f i c a t i o n _ r e p o r t 
 
 
 
 p r i n t ( " A c c u r a c y : " ,   a c c u r a c y _ s c o r e ( y _ t e s t ,   y _ p r e d ) ) 
 
 p r i n t ( " P r e c i s i o n : " ,   p r e c i s i o n _ s c o r e ( y _ t e s t ,   y _ p r e d ) ) 
 
 p r i n t ( " R e c a l l : " ,   r e c a l l _ s c o r e ( y _ t e s t ,   y _ p r e d ) ) 
 
 p r i n t ( " F 1   S c o r e : " ,   f 1 _ s c o r e ( y _ t e s t ,   y _ p r e d ) ) 
 
 p r i n t ( " C o n f u s i o n   M a t r i x : \ n " ,   c o n f u s i o n _ m a t r i x ( y _ t e s t ,   y _ p r e d ) ) 
 
 p r i n t ( " C l a s s i f i c a t i o n   R e p o r t : \ n " ,   c l a s s i f i c a t i o n _ r e p o r t ( y _ t e s t ,   y _ p r e d ,   t a r g e t _ n a m e s = [ " H A M " , " S P A M " ] ) ) 