import pandas as pd
import os
import tqdm
EDA
https://www.kaggle.com/robikscube/student-writing-competition-twitch-stream?scriptVersionId=83303421
# Load the training annotation table from the cached CSV.
df = pd.read_csv('/root/.cache/data/train.csv')
|
id |
discourse_id |
discourse_start |
discourse_end |
discourse_text |
discourse_type |
discourse_type_num |
predictionstring |
0 |
423A1CA112E2 |
1.622628e+12 |
8.0 |
229.0 |
Modern humans today are always on their phone…. |
Lead |
Lead 1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1… |
1 |
423A1CA112E2 |
1.622628e+12 |
230.0 |
312.0 |
They are some really bad consequences when stu… |
Position |
Position 1 |
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
2 |
423A1CA112E2 |
1.622628e+12 |
313.0 |
401.0 |
Some certain areas in the United States ban ph… |
Evidence |
Evidence 1 |
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
3 |
423A1CA112E2 |
1.622628e+12 |
402.0 |
758.0 |
When people have phones, they know about certa… |
Evidence |
Evidence 2 |
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9… |
4 |
423A1CA112E2 |
1.622628e+12 |
759.0 |
886.0 |
Driving is one of the way how to get around. P… |
Claim |
Claim 1 |
139 140 141 142 143 144 145 146 147 148 149 15… |
# List the distinct discourse labels that occur in the annotations.
df['discourse_type'].unique()
array(['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
'Counterclaim', 'Rebuttal'], dtype=object)
# Collect the distinct essay ids and check how many there are.
ids = df['id'].unique()
ids.size
# Known bad case: essay 2726E31ECDC6.
# NOTE(review): the selection below inspects a different essay id than the
# one flagged above -- confirm which one was intended.
an_df = df[df['id'] == 'FFFD0AF13501']
an_df
|
id |
discourse_id |
discourse_start |
discourse_end |
discourse_text |
discourse_type |
discourse_type_num |
predictionstring |
29376 |
FFFD0AF13501 |
1.619824e+12 |
237.0 |
280.0 |
they get to see tons of awesome landmarks. |
Claim |
Claim 1 |
44 45 46 47 48 49 50 51 |
29377 |
FFFD0AF13501 |
1.619824e+12 |
281.0 |
347.0 |
If you love horses and cattle then is most lik… |
Claim |
Claim 2 |
52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
29378 |
FFFD0AF13501 |
1.619824e+12 |
348.0 |
431.0 |
You get to enteract with them and feed them ca… |
Evidence |
Evidence 1 |
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 8… |
29379 |
FFFD0AF13501 |
1.619824e+12 |
431.0 |
516.0 |
Even if you just want to help out your world o… |
Claim |
Claim 3 |
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 9… |
29380 |
FFFD0AF13501 |
1.619824e+12 |
517.0 |
583.0 |
It’s amazing how much stuff you can do there t… |
Claim |
Claim 4 |
102 103 104 105 106 107 108 109 110 111 112 11… |
29381 |
FFFD0AF13501 |
1.619824e+12 |
584.0 |
943.0 |
you might be able to look at the pretty things… |
Evidence |
Evidence 2 |
116 117 118 119 120 121 122 123 124 125 126 12… |
29382 |
FFFD0AF13501 |
1.619824e+12 |
959.0 |
1050.0 |
all i’m saying is that the seagoing cowboys wo… |
Position |
Position 1 |
193 194 195 196 197 198 199 200 201 202 203 20… |
29383 |
FFFD0AF13501 |
1.619824e+12 |
1051.0 |
1245.0 |
You can go so many places and you rarely go to… |
Concluding Statement |
Concluding Statement 1 |
209 210 211 212 213 214 215 216 217 218 219 22… |
def get_instance_with_row(df, idx: int, data_dir: str = '/root/.cache/data/train'):
    """Build an enriched record for one annotation row.

    Looks up row ``idx`` of ``df`` by index label, reads the essay file
    ``<data_dir>/<id>.txt`` and adds the essay text together with the
    annotated span recovered in two ways: from the word indices in
    ``predictionstring`` and from the character offsets
    ``discourse_start`` / ``discourse_end``.

    Args:
        df: Annotation table providing at least the columns ``id``,
            ``predictionstring``, ``discourse_start`` and ``discourse_end``.
        idx: Index label of the row to load (resolved with ``df.loc``).
        data_dir: Directory that holds one ``<id>.txt`` essay per document.

    Returns:
        The row as a dict, extended with the keys ``text``, ``start_word``,
        ``end_word``, ``discourse_words`` and ``discourse_chars``.

    Raises:
        KeyError: If ``idx`` is not a label of ``df``'s index.
        OSError: If the essay file cannot be opened.
    """
    row = df.loc[idx].to_dict()
    file_name = os.path.join(data_dir, row['id'] + '.txt')
    # Decode explicitly: the character offsets only line up when the essay is
    # decoded the same way on every platform, whatever the locale default is.
    with open(file_name, 'r', encoding='utf-8') as fp:
        text = fp.read()
    row['text'] = text

    # predictionstring is a whitespace-separated run of word indices; only the
    # first and last entries are needed to delimit the span.
    word_indices = row['predictionstring'].split()
    row['start_word'], row['end_word'] = int(word_indices[0]), int(word_indices[-1])

    # Word-based view: whitespace tokens start_word..end_word (inclusive).
    words = text.split()
    row['discourse_words'] = ' '.join(words[row['start_word']:row['end_word'] + 1])

    # Character-based view: half-open slice [discourse_start, discourse_end).
    row['discourse_chars'] = text[int(row['discourse_start']):int(row['discourse_end'])]
    return row
# Spot-check the helper on a single row (index label 39597).
instance = get_instance_with_row(df, 39597)
instance
{'id': 'FFFF80B8CC2F',
'discourse_id': 1617042401315.0,
'discourse_start': 0.0,
'discourse_end': 990.0,
'discourse_text': 'Venus is a planet what belong the System Solar. Venus is the second planet from our sun. Earth, Venus and Mars our other planetry neighbor, orbit the sun at different speeds. Venus is sometimes right around the corner-in space term-humans have sent numerous spacecraft to land ono this cloud-draped word.\n\nIn the atomosphere of almost 97% carbon dioxide blankets Venus. Astronomers are fascinated by Venus because it may well once have been the most Earth-like planet in our solar system. Today go to the univerce is very dangers because not can to breathe and you can not survive.\n\nThe NASA has one particulary compelling idea for seding humans to study Venus .At thirty-plus miles above the surface, temperatures would still be toasty at around 170 degrees Fahrenheit, but the air pressure would be close to that of sesa level on Earth.\n\nNOt can have table or cell phone is a acid or heat capable of melting tin.\n\nThe people are very corious what investigator everything the System Solar.',
'discourse_type': 'Evidence',
'discourse_type_num': 'Evidence 1',
'predictionstring': '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167',
'text': 'Venus is a planet what belong the System Solar. Venus is the second planet from our sun. Earth, Venus and Mars our other planetry neighbor, orbit the sun at different speeds. Venus is sometimes right around the corner-in space term-humans have sent numerous spacecraft to land ono this cloud-draped word.\n\nIn the atomosphere of almost 97% carbon dioxide blankets Venus. Astronomers are fascinated by Venus because it may well once have been the most Earth-like planet in our solar system. Today go to the univerce is very dangers because not can to breathe and you can not survive.\n\nThe NASA has one particulary compelling idea for seding humans to study Venus .At thirty-plus miles above the surface, temperatures would still be toasty at around 170 degrees Fahrenheit, but the air pressure would be close to that of sesa level on Earth.\n\nNOt can have table or cell phone is a acid or heat capable of melting tin.\n\nThe people are very corious what investigator everything the System Solar.',
'start_word': 0,
'end_word': 167,
'discourse_words': 'Venus is a planet what belong the System Solar. Venus is the second planet from our sun. Earth, Venus and Mars our other planetry neighbor, orbit the sun at different speeds. Venus is sometimes right around the corner-in space term-humans have sent numerous spacecraft to land ono this cloud-draped word. In the atomosphere of almost 97% carbon dioxide blankets Venus. Astronomers are fascinated by Venus because it may well once have been the most Earth-like planet in our solar system. Today go to the univerce is very dangers because not can to breathe and you can not survive. The NASA has one particulary compelling idea for seding humans to study Venus .At thirty-plus miles above the surface, temperatures would still be toasty at around 170 degrees Fahrenheit, but the air pressure would be close to that of sesa level on Earth. NOt can have table or cell phone is a acid or heat capable of melting tin. The people are very corious what investigator everything the System Solar.',
'discourse_chars': 'Venus is a planet what belong the System Solar. Venus is the second planet from our sun. Earth, Venus and Mars our other planetry neighbor, orbit the sun at different speeds. Venus is sometimes right around the corner-in space term-humans have sent numerous spacecraft to land ono this cloud-draped word.\n\nIn the atomosphere of almost 97% carbon dioxide blankets Venus. Astronomers are fascinated by Venus because it may well once have been the most Earth-like planet in our solar system. Today go to the univerce is very dangers because not can to breathe and you can not survive.\n\nThe NASA has one particulary compelling idea for seding humans to study Venus .At thirty-plus miles above the surface, temperatures would still be toasty at around 170 degrees Fahrenheit, but the air pressure would be close to that of sesa level on Earth.\n\nNOt can have table or cell phone is a acid or heat capable of melting tin.\n\nThe people are very corious what investigator everything the System Solar.'}
# Load the tokenizer that belongs to the checkpoint. Forcing the BigBird
# tokenizer class onto a Longformer checkpoint registers BigBird's special
# tokens as brand-new vocabulary entries the pretrained embeddings do not
# cover; AutoTokenizer resolves the class from the checkpoint's own config.
# BigBirdTokenizerFast stays imported in case other cells rely on the name.
from transformers import AutoTokenizer, BigBirdTokenizerFast
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-large-4096')
def mapping_word_to_token(word_ids, word_start, word_end):
    """Map an inclusive word span onto an inclusive token span.

    Args:
        word_ids: Iterable giving, for each token position, the index of the
            word that token belongs to. Entries that equal neither bound
            (e.g. ``None``) are simply skipped. Presumably the ``word_ids()``
            of a fast tokenizer encoding -- confirm against the caller.
        word_start: Word index of the first word in the span.
        word_end: Word index of the last word in the span.

    Returns:
        ``(token_start, token_end)``: the position of the first token whose
        word id equals ``word_start`` and of the last token whose word id
        equals ``word_end``. Either value is ``-1`` when no token matches.
    """
    token_start, token_end = -1, -1
    # Single pass, so positions stay correct even for one-shot iterables.
    for idx, word_id in enumerate(word_ids):
        # Keep only the first hit for the start word ...
        if token_start == -1 and word_id == word_start:
            token_start = idx
        # ... but keep overwriting so the last hit for the end word wins.
        if word_id == word_end:
            token_end = idx
    return token_start, token_end
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
(discourse_start, discourse_end) 与 predictionstring 是什么关系?
def stat_relation(df, data_dir='/root/.cache/data/train', max_rows=6000000, errors=None):
    """Count rows whose ``discourse_text`` matches the essay's character span.

    For every row the essay ``<data_dir>/<id>.txt`` is sliced with
    ``[discourse_start:discourse_end]``. A row counts as a match when that
    slice equals ``discourse_text``, or, failing that, when the two still
    agree after the final character is dropped from both.

    Args:
        df: Annotation table with the columns ``id``, ``discourse_start``,
            ``discourse_end`` and ``discourse_text``.
        data_dir: Directory that holds one ``<id>.txt`` essay per document.
        max_rows: Stop after this many rows (a positive int); ``None`` scans
            every row.
        errors: Optional list; when given, the index label of every
            non-matching row is appended to it.

    Returns:
        ``{'char': <number of matching rows>}``. The key is absent when no
        row matched.

    Raises:
        OSError: If an essay file cannot be opened.
    """
    stat = {}
    essays = {}  # essay id -> text, so each essay file is read only once
    total = len(df) if max_rows is None else min(len(df), max_rows)
    for cnt, (idx, item) in enumerate(tqdm.tqdm(df.iterrows(), total=total), start=1):
        id_ = item['id']
        text = essays.get(id_)
        if text is None:
            file_name = os.path.join(data_dir, id_ + '.txt')
            # Decode explicitly so character offsets do not depend on the
            # platform's default encoding.
            with open(file_name, 'r', encoding='utf-8') as fp:
                text = fp.read()
            essays[id_] = text

        start, end = int(item['discourse_start']), int(item['discourse_end'])
        discourse_text = item['discourse_text']

        # Exact match first; otherwise retry with the last character trimmed
        # from both the annotation and the slice.
        matched = (
            discourse_text == text[start:end]
            or discourse_text[:-1] == text[start:end - 1]
        )
        if matched:
            stat['char'] = stat.get('char', 0) + 1
        elif errors is not None:
            errors.append(idx)

        if cnt == max_rows:
            break
    return stat
# Run the character-offset consistency check over the annotation table.
stat_relation(df)
统计predstring正确数
def stat_predstring(df, data_dir='/root/.cache/data/train', max_rows=50):
    """Count rows whose ``discourse_text`` matches the ``predictionstring`` span.

    For every row the essay ``<data_dir>/<id>.txt`` is split on whitespace
    and the words from the first to the last index listed in
    ``predictionstring`` (inclusive) are re-joined with single spaces. Rows
    where that text equals ``discourse_text`` are counted; for every other
    row both versions are printed for inspection.

    Args:
        df: Annotation table with the columns ``id``, ``discourse_text`` and
            ``predictionstring``.
        data_dir: Directory that holds one ``<id>.txt`` essay per document.
        max_rows: Stop after this many rows (a positive int); ``None`` scans
            every row.

    Returns:
        ``{'word': <number of matching rows>}``. The key is absent when no
        row matched.

    Raises:
        OSError: If an essay file cannot be opened.
    """
    stat = {}
    essay_words = {}  # essay id -> whitespace-split words, computed once
    total = len(df) if max_rows is None else min(len(df), max_rows)
    for cnt, (_, item) in enumerate(tqdm.tqdm(df.iterrows(), total=total), start=1):
        id_ = item['id']
        words = essay_words.get(id_)
        if words is None:
            file_name = os.path.join(data_dir, id_ + '.txt')
            with open(file_name, 'r', encoding='utf-8') as fp:
                words = fp.read().split()
            essay_words[id_] = words

        # Only the first and last word index are needed to delimit the span.
        pred_ls = item['predictionstring'].split()
        start, end = int(pred_ls[0]), int(pred_ls[-1])
        text_pred = ' '.join(words[start:end + 1])

        discourse_text = item['discourse_text']
        if discourse_text == text_pred:
            stat['word'] = stat.get('word', 0) + 1
        else:
            # Show the annotation and the reconstructed span one after the other.
            print('=' * 200)
            print(discourse_text)
            print()
            print(text_pred)
            print('=' * 200)

        if cnt == max_rows:
            break
    return stat
Share on:
Twitter
❄ Facebook
❄ Email