- 01
- 02
- 03
- 04
- 05
- 06
- 07
- 08
- 09
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
# Dump of the "хуза" database
# Post-processing of the MySQL export
import pandas as pd
import numpy as np
import csv
# Load the raw comments export. The file is read without a header row, so the
# columns are addressed by integer position (0, 1, 2, ...) further down.
comments = pd.read_csv(
    filepath_or_buffer='/wp_comments-2.csv',
    header=None,
)
# Preview the first rows (only shown when the cell is run interactively).
comments.head(n=5)
##
# Column 3 is split on '@' once: element 0 (the part before '@') becomes
# `gravatar_hash`, element 1 (the part after it) becomes `gravatar_domain`.
# NOTE(review): presumably column 3 is the WordPress comment_author_email
# field — confirm against the export schema.
email_parts = comments[3].str.split('@')

# Assemble the cleaned comments table from the positional columns of the raw
# export; the dict order defines the output column order.
comments_clean = pd.DataFrame(
    data={
        'comment_id': comments[0],
        'comment_post_id': comments[1],
        'comment_parent': comments[13],
        'name': comments[2],
        'gravatar_hash': email_parts.str.get(0),
        'gravatar_domain': email_parts.str.get(1),
        'profile': comments[4],
        'date': comments[6],
        'content': comments[8],
    }
)
##
# Inspect the comments that have an e-mail domain which is not 'lo.ol'.
# This is a bare expression: it only displays something when the cell is run
# interactively and does not modify `comments_clean`.
comments_clean[
    comments_clean['gravatar_domain'].notna()
    & comments_clean['gravatar_domain'].ne('lo.ol')
]
##
# Keep only the comments whose e-mail domain is 'lo.ol' or missing, then remove
# the helper `gravatar_domain` column, which is no longer needed.
#
# The drop is chained as a non-inplace call: an in-place drop on a
# boolean-filtered frame can raise SettingWithCopyWarning on pandas versions
# without copy-on-write, while this form yields the same result silently.
comments_clean = comments_clean[
    (comments_clean.gravatar_domain == 'lo.ol')
    | comments_clean.gravatar_domain.isna()
].drop(columns=['gravatar_domain'])
# Preview the result (only shown when the cell is run interactively).
comments_clean.head()
##
# Load the raw posts export; it is read without a header row, so columns are
# addressed by integer position.
posts = pd.read_csv('/wp_posts.csv', header=None)

# Keep rows where column 20 is 'post', column 7 is 'publish' and column 11
# starts with an underscore.
# NOTE(review): presumably these are the WordPress post_type, post_status and
# post_name fields — confirm against the export schema.
#
# `na=False` makes rows with a missing value in column 11 evaluate to False,
# so the combined mask is strictly boolean instead of relying on implicit
# NaN handling inside `&`.
posts = posts[
    (posts[20] == 'post')
    & (posts[7] == 'publish')
    & posts[11].str.match('^_', na=False)
]
# Preview the result (only shown when the cell is run interactively).
posts.head()
##
# Output column name -> positional column of the filtered posts export that is
# copied over unchanged. The dict order defines the output column order.
post_columns = {
    'post_id': 0,
    'date': 2,
    'content': 4,
    'description': 6,
}
posts_clean = pd.DataFrame(
    {label: posts[position] for label, position in post_columns.items()}
)
# `original_id` is column 11 with its first character removed (the filter
# applied to `posts` keeps only values that start with '_').
posts_clean['original_id'] = posts[11].str.slice(start=1)
# Preview the result (only shown when the cell is run interactively).
posts_clean.head()
##
# Write both cleaned tables to CSV, omitting the DataFrame index column.
for frame, destination in (
    (comments_clean, '~/Downloads/gost/comments.csv'),
    (posts_clean, '~/Downloads/gost/posts.csv'),
):
    frame.to_csv(destination, index=False)