파이썬 텍스트 네트워크 분석 - paisseon tegseuteu neteuwokeu bunseog

전에 UCINET으로 네트워크 시각화 하는 방법에 대해서 적었는데, 이번에는 아예 Python으로 동시출현단어 쌍을 만들고 -> Gephi용 확장자인 graphml로 output을 산출하는 과정에 대해서 적어보고자 한다.

동시출현 단어쌍 만들기

먼저 pandas를 불러온 후에, readlines 명령어로 미리 형태소분석이 된 데이터를 불러온다.

import pandas as pd
# Load the input text file, one list entry per line.
# The context manager closes the file as soon as the lines are in memory,
# instead of keeping the handle open for the rest of the script.
with open("파일 위치/파일명.txt", encoding='UTF8') as f:
    lines = f.readlines()  # read the whole file as a list of lines
len(lines)  # number of lines that were read
lines  # display the data to check that it loaded correctly

words_list를 만들어서 lines의 데이터를 넣어준다. (append)

# For every line, split on tabs, take the third field and strip the
# surrounding whitespace (including the trailing newline).
words_list = [row.split('\t')[2].strip() for row in lines]
words_list[0]  # display the first entry as a sanity check

우선 동시출현 단어쌍을 만드는 방법에 대해서 이해해보자. temp2에 단어 4개를 집어넣은 다음에, 아래와 같이 for문을 돌려보자.

# Toy example: count every pair of words taken from a four-word list.
temp2 = ["빠빠","엄마","누나","나"]
count = {}  # dict mapping an ordered (smaller, larger) word pair to its count
for idx, first in enumerate(temp2):
    # Pair each word only with the words after it, so every combination
    # is visited exactly once. Identical words are not skipped.
    for second in temp2[idx + 1:]:
        # Put the pair in sorted order so (x, y) and (y, x) share one key.
        pair = (second, first) if first > second else (first, second)
        # dict.get supplies 0 the first time a pair is seen.
        count[pair] = count.get(pair, 0) + 1

count 딕셔너리를 조회해보면

count

아래와 같이 4C2 (조합ㅎㅎ)인 6개 동시출현쌍에서 동시출현 횟수를 알 수 있다!

count = {}  # co-occurrence frequency per ordered word pair
for document in words_list:
    # Split on whitespace and keep each word once, so a word repeated
    # inside one document does not produce extra co-occurrences.
    unique_words = list(set(document.split()))

    for idx, first in enumerate(unique_words):
        for second in unique_words[idx + 1:]:
            # Sorted key: (x, y) and (y, x) are counted as the same pair.
            pair = (second, first) if first > second else (first, second)
            count[pair] = count.get(pair, 0) + 1

# A set keeps each distinct word once, so the repeated word collapses.
{"아빠", "아빠", "엄마"}

# When the key ("a", "b") is absent, get() falls back to the default 0.
count.get(("a", "b"), 0)
count  # display the full co-occurrence dict

# Convert the {(term1, term2): freq} dict into a pandas DataFrame.
# orient='index' lays the dict keys out as rows.
df = pd.DataFrame.from_dict(count, orient='index')
df.head()

# Flatten every entry into a [term1, term2, freq] row. The rows are built
# straight from the dict (same order as df) rather than via df[0][i]:
# indexing a Series with an integer on a non-integer index is a positional
# fallback that pandas has deprecated.
list1 = [[term1, term2, freq] for (term1, term2), freq in count.items()]

# Build the pair table with named columns.
df2 = pd.DataFrame(list1, columns=["term1","term2","freq"])

# Sort by frequency; ascending=False is required because the default
# order is ascending.
df3 = df2.sort_values(by=['freq'], ascending=False)

df3.head(100)

Networkx패키지로 네트워크 분석하기

#추천 참고자료

박건영. 2019. Networkx를 활용한 네트워크 분석 기법 기초 입문

import numpy as np
import networkx as nx
import operator

# np.where returns the positions where the condition holds; counting them
# gives the number of pairs with a frequency of at least 5 (1027 pairs in
# the author's data set).
n_edges = len(np.where(df3['freq'] >= 5)[0])
n_edges

G = nx.Graph()
# df3 is sorted by frequency but still carries the row labels of df2, so
# df3['term1'][i] would look up the row *labelled* i rather than the i-th
# most frequent pair. Select by position so the graph really contains the
# highest-frequency pairs; slicing also copes with fewer rows than expected.
top_pairs = df3.iloc[:n_edges]
for term1, term2, freq in zip(top_pairs['term1'], top_pairs['term2'], top_pairs['freq']):
    G.add_edge(term1, term2, weight=int(freq))

# Compute centralities for nodes.
# The degree centrality values are normalized by dividing by the maximum
# possible degree in a simple graph n-1 where n is the number of nodes in G.
dgr = nx.degree_centrality(G)
btw = nx.betweenness_centrality(G)
cls = nx.closeness_centrality(G)

# Each items() entry is (node, score); itemgetter(1) sorts on the score and
# reverse=True gives descending order.
by_score = operator.itemgetter(1)
sorted_dgr = sorted(dgr.items(), key=by_score, reverse=True)
sorted_btw = sorted(btw.items(), key=by_score, reverse=True)
sorted_cls = sorted(cls.items(), key=by_score, reverse=True)

# Print the top 20 nodes per measure. Slicing (instead of indexing 0..19)
# avoids an IndexError when the graph has fewer than 20 nodes.
for title, ranking in (
    ("** degree **", sorted_dgr),
    ("** betweenness **", sorted_btw),
    ("** closeness **", sorted_cls),
):
    print(title)
    for entry in ranking[:20]:
        print(entry)

# Step before visualising a word-pair frequency table in Gephi: write the
# table out in the graphml file format.
class MakeGraphml:
    """Write a table of weighted term pairs to a GraphML file."""

    def make_graphml(self, pair_file, graphml_file):
        """Serialize the pairs in *pair_file* as an undirected GraphML graph.

        Args:
            pair_file: pandas DataFrame read by position: column 0 and
                column 1 hold the two terms of a pair, column 2 the value
                written as the edge weight.
            graphml_file: path of the output file; it is overwritten and
                encoded as UTF-8.

        Every distinct term becomes a node whose id is its 1-based order of
        first appearance and whose label is the term. Every distinct
        ((term1, term2), weight) row becomes one edge.
        """
        # Local import keeps the class self-contained.
        from xml.sax.saxutils import escape

        # Dicts double as insertion-ordered sets, giving O(1) duplicate
        # checks while preserving first-seen order.
        node_ids = {}  # {term: node id}
        edges = {}     # {((term1, term2), weight): None}
        rows = zip(pair_file.iloc[:, 0], pair_file.iloc[:, 1], pair_file.iloc[:, 2])
        for e1, e2, weight in rows:
            edges[((e1, e2), weight)] = None
            for term in (e1, e2):
                if term not in node_ids:
                    node_ids[term] = len(node_ids) + 1
        print('# terms: %s' % len(node_ids))

        # schemaLocation is a whitespace-separated "namespace schema-URL"
        # pair, so the two URLs must not be fused together.
        header = (
            '<?xml version="1.0" encoding="UTF-8"?>'
            '<graphml xmlns="http://graphml.graphdrawing.org/xmlns"'
            ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
            ' xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns'
            ' http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">'
            '<key id="d1" for="edge" attr.name="weight" attr.type="double"/>'
            '<key id="d0" for="node" attr.name="label" attr.type="string"/>'
            '<graph id="Entity" edgedefault="undirected">\n'
        )

        # The context manager closes the file even if a write fails.
        with open(graphml_file, 'w', encoding='utf-8') as out:
            out.write(header)

            # nodes
            for term, node_id in node_ids.items():
                out.write('<node id="%d">\n' % node_id)
                # Escape &, < and > so any term yields well-formed XML;
                # str() also allows non-string terms.
                out.write('<data key="d0">' + escape(str(term)) + '</data>\n')
                out.write('</node>')

            # edges
            for (source, target), weight in edges:
                out.write('<edge source="%d" target="%d">\n'
                          % (node_ids[source], node_ids[target]))
                out.write('<data key="d1">' + escape(str(weight)) + '</data>\n')
                out.write('</edge>')

            out.write('</graph> </graphml>')

        print('now you can see %s' % graphml_file)

graphml_file = '파일명.graphml'
gm = MakeGraphml()

# iloc selects by position; slicing the rows alone keeps every column,
# so this passes the first 1027 rows of the sorted pair table.
gm.make_graphml(df3.iloc[:1027], graphml_file)

# Release the input file handle opened at the start of the script.
f.close()

파이썬 텍스트 네트워크 분석 - paisseon tegseuteu neteuwokeu bunseog

관련 게시물

Async 비동기 - Async bidong-gi

TIP120 사용법 - TIP120 sayongbeob

아두이노 블루투스 미세먼지 센서값 전송 - adu-ino beullutuseu misemeonji senseogabs jeonsong

Python glob 사용법 - Python glob sayongbeob

자바 슈팅 게임 만들기 - jaba syuting geim mandeulgi

다 익스트라 알고리즘 최단 경로 C++ - da igseuteula algolijeum choedan gyeonglo C++

쇼핑몰 주문 DB - syopingmol jumun DB

자바 게임 코드 - jaba geim kodeu

MySQL CASCADE 확인 - MySQL CASCADE hwag-in

세마포어 사용 이유 - semapo-eo sayong iyu

광고하는

최근 소식

주민등록번호 로 휴대폰 번호 찾기 - jumindeunglogbeonho lo hyudaepon beonho chajgi

아기는 몇살까지? - agineun myeochsalkkaji?

섬머 포켓 스위치 - seommeo pokes seuwichi

사진 수염 제거 어플 - sajin suyeom jegeo eopeul

복면가 왕 인디언 인형 누구야? - bogmyeonga wang indieon inhyeong nuguya?

마법천자문 전투력 순위 - mabeobcheonjamun jeontulyeog sun-wi

롤20 브금 매크로 - lol20 beugeum maekeulo

아동학대예방교육 가정통신문 - adonghagdaeyebang-gyoyug gajeongtongsinmun

Ezpdf 여백 없이 인쇄 - Ezpdf yeobaeg eobs-i inswae

온 세상 창조주 ppt - on sesang changjoju ppt

광고하는

포퓰러

광고하는

에 대한

합법적인

돕다

사회의