จัดเรียงสตริงในคอลัมน์และพิมพ์กราฟ

ฉันมีดาต้าเฟรม แต่สตริงทั้งหมดซ้ำกัน และเมื่อฉันลองพิมพ์กราฟ มันมีคอลัมน์ที่ซ้ำกัน ฉันพยายามลบมัน แต่กราฟของฉันก็พิมพ์ไม่ถูกต้อง CSV ของฉันที่นี่

ดาต้าเฟรม common_users:

     used_at  common users                     pair of websites
0       2014          1364                   avito.ru and e1.ru
1       2014          1364                   e1.ru and avito.ru
2       2014          1716                 avito.ru and drom.ru
3       2014          1716                 drom.ru and avito.ru
4       2014          1602                 avito.ru and auto.ru
5       2014          1602                 auto.ru and avito.ru
6       2014           299           avito.ru and avtomarket.ru
7       2014           299           avtomarket.ru and avito.ru
8       2014           579                   avito.ru and am.ru
9       2014           579                   am.ru and avito.ru
10      2014           602             avito.ru and irr.ru/cars
11      2014           602             irr.ru/cars and avito.ru
12      2014           424       avito.ru and cars.mail.ru/sale
13      2014           424       cars.mail.ru/sale and avito.ru
14      2014           634                    e1.ru and drom.ru
15      2014           634                    drom.ru and e1.ru
16      2014           475                    e1.ru and auto.ru
17      2014           475                    auto.ru and e1.ru
.....

คุณจะเห็นว่าชื่อของเว็บไซต์ถูกสลับลำดับกัน ฉันพยายามเรียงลำดับตาม pair of websites แต่ได้ KeyError โค้ดที่ฉันใช้คือ:

df = pd.read_csv("avito_trend.csv", parse_dates=[2])

def f(df):
    """Count shared users for every pair of websites, in both label orders.

    ``df`` is expected to have an ``'address'`` column and an ``'ID'``
    column.  For each 2-combination of the distinct addresses, two one-row
    frames are appended: one labelled ``"a and b"`` and one labelled
    ``"b and a"``.  All rows are concatenated and returned; every row keeps
    the index label 0.
    """
    dfs = []
    # Pairs are built from unique() in whatever order it returns the
    # addresses, so the labels depend on the order of the incoming rows.
    for x in [list(x) for x in itertools.combinations(df['address'].unique(), 2)]:

        # IDs seen on the first / second site of the pair.
        c1 = df.loc[df['address'].isin([x[0]]), 'ID']
        c2 = df.loc[df['address'].isin([x[1]]), 'ID']
        c = pd.Series(list(set(c1).intersection(set(c2))))
        # Same intersection with the operands swapped; set intersection is
        # symmetric, so len(c_invert) == len(c).
        c_invert = pd.Series(list(set(c2).intersection(set(c1))))
        dfs.append(pd.DataFrame({'common users':len(c), 'pair of websites':' and '.join(x)}, index=[0]))
        # Swap the two names so the second row carries the mirrored label.
        x[1],x[0] = x[0],x[1]
        dfs.append(pd.DataFrame({'common users':len(c_invert), 'pair of websites':' and '.join(x)}, index=[0]))
    return pd.concat(dfs)

# Apply f to each calendar year of 'used_at'; index level 1 is dropped and
# the remaining level is turned back into a column.
common_users = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()

# Reshape to one row per 'pair of websites' and one column per 'used_at' value.
graph_by_common_users = common_users.pivot(index='pair of websites', columns='used_at', values='common users')
# Order the rows by the 2014 column, largest first.
graph_by_common_users = graph_by_common_users.sort_values(2014, ascending=False)

ax = graph_by_common_users.plot(kind='barh', width=0.5, figsize=(10,20))
# Rotate the x tick labels (the list built here is discarded).
[label.set_rotation(25) for label in ax.get_xticklabels()]


rects = ax.patches 
# One integer label per cell, walking column by column and, within a column,
# row by row.  NOTE(review): assumes ax.patches lists the bars in that same
# order - confirm.
labels = [int(round(graph_by_common_users.loc[i, y])) for y in graph_by_common_users.columns.tolist() for i in graph_by_common_users.index] 
for rect, label in zip(rects, labels): 
    # 'height' is assigned but not used below.
    height = rect.get_height() 
    # Place the label 3 data units past the bar's width.
    ax.text(rect.get_width() + 3, rect.get_y() + rect.get_height(), label, fontsize=8)

plt.show()

กราฟของฉันดูเหมือนว่า:

python pandas matplotlib

ldevyataykina 20.03.2016 แหล่งที่มา

comment

คุณช่วยระบุรายการป้ายกำกับที่คาดหวังได้ไหม เนื่องจากยังไม่ชัดเจนว่าคุณต้องการบรรลุผลอะไร - MaxU 20.03.2016

comment

ตอนนี้ฉันมีปัญหาอื่น ฉันผ่านอาร์เรย์และรับ

rects = ax1.patches labels = ["%d" % i for i in time['time online'].round()] for rect, label in zip(rects, labels):     print rect, label     height = rect.get_height()     ax1.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

ฉันอธิบายปัญหาของฉันไว้ในคำถาม - ldevyataykina 20.03.2016

คำตอบ (2)

arrow_upward
1
arrow_downward

ขั้นแรกคุณสามารถเพิ่มคอลัมน์ใหม่ sort ในฟังก์ชัน f จากนั้นจัดเรียงค่าตามคอลัมน์ pair of websites และ drop_duplicates ตามคอลัมน์ used_at และ sort:

import pandas as pd
import itertools

df = pd.read_csv("avito_trend.csv", 
                      parse_dates=[2])


def f(df):
    """Count shared users for every pair of websites, tagging mirrored rows.

    ``df`` is expected to have an ``'address'`` column and an ``'ID'``
    column.  Each 2-combination of the distinct addresses yields two one-row
    frames, labelled ``"a and b"`` and ``"b and a"``, that share the same
    ``'sort'`` number, so a caller can later keep just one row of each
    mirrored pair.  The concatenated rows are returned; every row keeps the
    index label 0.
    """
    dfs = []
    # Running pair number, written to the 'sort' column of both mirrored rows.
    i = 0
    for x in [list(x) for x in itertools.combinations(df['address'].unique(), 2)]:
        i += 1
        # IDs seen on the first / second site of the pair.
        c1 = df.loc[df['address'].isin([x[0]]), 'ID']
        c2 = df.loc[df['address'].isin([x[1]]), 'ID']
        c = pd.Series(list(set(c1).intersection(set(c2))))
        # Same intersection with the operands swapped; set intersection is
        # symmetric, so len(c_invert) == len(c).
        c_invert = pd.Series(list(set(c2).intersection(set(c1))))
        dfs.append(pd.DataFrame({'common users':len(c), 'pair of websites':' and '.join(x), 'sort': i}, index=[0]))
        # Swap the two names so the second row carries the mirrored label.
        x[1],x[0] = x[0],x[1]
        dfs.append(pd.DataFrame({'common users':len(c_invert), 'pair of websites':' and '.join(x), 'sort': i}, index=[0]))
    return pd.concat(dfs)

# Apply f to each calendar year of 'used_at', then flatten the index again.
common_users = (
    df.groupby([df['used_at'].dt.year])
    .apply(f)
    .reset_index(drop=True, level=1)
    .reset_index()
)

# After sorting by label, drop_duplicates keeps the first row of every
# (year, pair number) group, i.e. one of the two mirrored labels.
common_users = common_users.sort_values('pair of websites')
common_users = common_users.drop_duplicates(subset=['used_at', 'sort'])

graph_by_common_users = common_users.pivot(
    index='pair of websites',
    columns='used_at',
    values='common users',
)

# Column order 2015, 2014; rows ranked by the 2014 counts, largest first.
graph_by_common_users = graph_by_common_users[[2015, 2014]]
graph_by_common_users = graph_by_common_users.sort_values(2014, ascending=False)

ax = graph_by_common_users.plot(kind='barh', width=0.5, figsize=(10, 20))
for label in ax.get_xticklabels():
    label.set_rotation(25)

# Integer labels, collected column by column and row by row within a column.
rects = ax.patches
labels = []
for y in graph_by_common_users.columns.tolist():
    for i in graph_by_common_users.index:
        labels.append(int(round(graph_by_common_users.loc[i, y])))

for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_width() + 20, rect.get_y() - 0.25 + height, label, fontsize=8)

# Rebuild the legend with its entries ordered by label text.
handles, labels = ax.get_legend_handles_labels()
labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
ax.legend(handles, labels)

กราฟของฉัน:

แก้ไข:

ความคิดเห็นคือ:

ทำไมคุณถึงสร้าง c_invert และ x[1],x[0] = x[0],x[1]

เนื่องจากชุดค่าผสมสำหรับปี 2014 และ 2015 แตกต่างกัน - มี 4 ค่าหายไปในคอลัมน์แรก และอีก 4 ค่าหายไปในคอลัมน์ที่สอง:

used_at                                2015    2014
pair of websites                                   
avito.ru and drom.ru                 1491.0  1716.0
avito.ru and auto.ru                 1473.0  1602.0
avito.ru and e1.ru                   1153.0  1364.0
drom.ru and auto.ru                     NaN   874.0
e1.ru and drom.ru                     539.0   634.0
avito.ru and irr.ru/cars              403.0   602.0
avito.ru and am.ru                    262.0   579.0
e1.ru and auto.ru                     451.0   475.0
avito.ru and cars.mail.ru/sale        256.0   424.0
drom.ru and irr.ru/cars               277.0   423.0
auto.ru and irr.ru/cars               288.0   409.0
auto.ru and am.ru                     224.0   408.0
drom.ru and am.ru                     187.0   394.0
auto.ru and cars.mail.ru/sale         195.0   330.0
avito.ru and avtomarket.ru            205.0   299.0
drom.ru and cars.mail.ru/sale         189.0   292.0
drom.ru and avtomarket.ru             175.0   247.0
auto.ru and avtomarket.ru             162.0   243.0
e1.ru and irr.ru/cars                 148.0   235.0
e1.ru and am.ru                        99.0   224.0
am.ru and irr.ru/cars                   NaN   223.0
irr.ru/cars and cars.mail.ru/sale      94.0   197.0
am.ru and cars.mail.ru/sale             NaN   166.0
e1.ru and cars.mail.ru/sale           105.0   154.0
e1.ru and avtomarket.ru               105.0   139.0
avtomarket.ru and irr.ru/cars           NaN   139.0
avtomarket.ru and am.ru                72.0   133.0
avtomarket.ru and cars.mail.ru/sale    48.0   105.0
auto.ru and drom.ru                   799.0     NaN
cars.mail.ru/sale and am.ru            73.0     NaN
irr.ru/cars and am.ru                 102.0     NaN
irr.ru/cars and avtomarket.ru          73.0     NaN

จากนั้นฉันก็สร้างชุดค่าผสมแบบสลับลำดับทั้งหมด - ปัญหาได้รับการแก้ไขแล้ว แต่ทำไมถึงมี NaN? เหตุใดชุดค่าผสมจึงแตกต่างกันในปี 2014 และ 2015

ฉันเพิ่มในฟังก์ชัน f:

def f(df):
    print df['address'].unique()

    dfs = []
    i = 0
    for x in [list(x) for x in itertools.combinations((df['address'].unique()), 2)]:
...
...

และเอาต์พุตคือ (เหตุผลที่รายการแรกถูกพิมพ์สองครั้งอธิบายไว้ใน warning ที่นี่):

['avito.ru' 'e1.ru' 'drom.ru' 'auto.ru' 'avtomarket.ru' 'am.ru'
 'irr.ru/cars' 'cars.mail.ru/sale']
['avito.ru' 'e1.ru' 'drom.ru' 'auto.ru' 'avtomarket.ru' 'am.ru'
 'irr.ru/cars' 'cars.mail.ru/sale']
['avito.ru' 'e1.ru' 'auto.ru' 'drom.ru' 'irr.ru/cars' 'avtomarket.ru'
 'cars.mail.ru/sale' 'am.ru']

ดังนั้นรายการจึงแตกต่างกันและชุดค่าผสมก็แตกต่างกันเช่นกัน -> ฉันได้รับค่า NaN บางส่วน

วิธีแก้ไขคือการเรียงลำดับรายการชุดค่าผสม

def f(df):
    #print (sorted(df['address'].unique()))   
    dfs = []
    for x in [list(x) for x in itertools.combinations(sorted(df['address'].unique()), 2)]:
        c1 = df.loc[df['address'].isin([x[0]]), 'ID']
        ...
        ...

โค้ดทั้งหมดคือ:

import pandas as pd
import itertools

df = pd.read_csv("avito_trend.csv", 
                      parse_dates=[2])

def f(df):
    """Count the users shared by every pair of websites in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``'address'`` column (website name) and an ``'ID'`` column
        (user identifier).

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of addresses, with the columns
        ``'common users'`` (size of the ID intersection) and
        ``'pair of websites'`` (``"<a> and <b>"``, where ``a`` sorts before
        ``b``).  Rows carry a plain 0..n-1 index.  The frame is empty, but
        still has both columns, when ``df`` holds fewer than two distinct
        addresses.
    """
    # Sorting makes the labels independent of the order in which addresses
    # appear in the rows, so every group produces identical pair labels.
    sites = sorted(df['address'].unique())

    # Collect each site's IDs once instead of re-filtering the frame per pair.
    users = {site: set(df.loc[df['address'].isin([site]), 'ID']) for site in sites}

    rows = [
        {'common users': len(users[a] & users[b]),
         'pair of websites': ' and '.join((a, b))}
        for a, b in itertools.combinations(sites, 2)
    ]
    # Naming the columns keeps the layout stable when ``rows`` is empty,
    # a case in which pd.concat([]) would raise ValueError.
    return pd.DataFrame(rows, columns=['common users', 'pair of websites'])

# Apply f to each calendar year of 'used_at'; index level 1 is dropped and
# the remaining level is turned back into a column.
common_users = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()

# One row per pair of websites, one column per 'used_at' value.
graph_by_common_users = common_users.pivot(index='pair of websites', columns='used_at', values='common users')

# Column order 2015, 2014; rows ranked by the 2014 counts, largest first.
graph_by_common_users = graph_by_common_users[[2015,2014]]
graph_by_common_users = graph_by_common_users.sort_values(2014, ascending=False)

ax = graph_by_common_users.plot(kind='barh', width=0.5, figsize=(10,20))
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(25)

# Integer labels, collected column by column and row by row within a column.
# NOTE(review): assumes ax.patches lists the bars in that same order - confirm.
rects = ax.patches
bar_labels = [
    int(round(graph_by_common_users.loc[i, y]))
    for y in graph_by_common_users.columns.tolist()
    for i in graph_by_common_users.index
]

for rect, label in zip(rects, bar_labels):
    height = rect.get_height()
    ax.text(rect.get_width()+20, rect.get_y() - 0.25 + height, label, fontsize=8)

# Rebuild the legend with its entries ordered by label text.  This runs once,
# after the annotation loop: it does not depend on any individual bar.
handles, labels = ax.get_legend_handles_labels()
labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
ax.legend(handles, labels)

และกราฟ:

jezrael 30.03.2016

comment

เป็นไปได้ไหมที่จะเลื่อนตัวเลขลงมาอีกหน่อย เพราะบางตัวซ้อนทับกัน - ldevyataykina; 30.03.2016

comment

และพิมพ์ 2014 เหนือ 2015? - ldevyataykina; 30.03.2016

comment

โอเค ให้เวลาฉันหน่อย แต่ปัญหาแรกได้รับการแก้ไขแล้ว โปรดดูแก้ไข - jezrael; 30.03.2016

comment

และคุณช่วยเปลี่ยนลำดับของปีที่มุมขวาบนได้ไหม ให้ 2014 มาก่อนแล้วตามด้วย 2015 - ldevyataykina; 30.03.2016

comment

ขอบคุณมาก. นี่คือสิ่งที่ฉันต้องการ หากฉันมีคำถามเกี่ยวกับรหัส ฉันขอถามคุณได้ไหม - ldevyataykina; 30.03.2016

comment

โอเค ลองถามดู แต่ตอนนี้ไปเที่ยวแล้วอาจจะตอบทีหลังก็ได้ - jezrael; 30.03.2016

comment

ทำไมคุณถึงสร้าง c_invert และ x[1],x[0] = x[0],x[1] ? - ldevyataykina; 31.03.2016

comment

ขอบคุณสำหรับคำอธิบายของคุณ) มันชัดเจนมาก และคุณบอกได้ไหมว่าคุณทำอะไรเพื่อพัฒนาทักษะของคุณ? - ldevyataykina; 31.03.2016

comment

การเขียนโค้ด การเขียนโค้ด การเขียนโค้ด... :) ศึกษาเอกสารประกอบของ pandas ซึ่งยอดเยี่ยมมาก, Cookbook ก็ยอดเยี่ยมเช่นกัน และตอบคำถามเกี่ยวกับ pandas บน StackOverflow :) - jezrael; 31.03.2016

arrow_upward
0
arrow_downward

ปัญหาการตั้งค่า DataFrame

ดูเหมือนว่า DataFrame ของคุณไม่มีโครงสร้างตามที่คุณต้องการ DataFrame ของคุณมี 2014 และ 2015 เป็นชื่อส่วนหัวคอลัมน์ ไม่ใช่ค่าแถวในดัชนี used_at นอกจากนี้ used_at ยังเป็นชื่อดัชนี ไม่ใช่ป้ายกำกับดัชนีของแถวแรก

คุณสามารถทดสอบได้ว่าสิ่งนี้เป็นจริงโดยดำเนินการ:

import pandas as pd
from cStringIO import StringIO

text_data = '''
used_at            2014  2015
address                      
am.ru               621   273
auto.ru            1752  1595
avito.ru           5460  4631
avtomarket.ru       314   215
cars.mail.ru/sale   457   271
drom.ru            1934  1623
e1.ru              1654  1359
irr.ru/cars         619   426
'''

# Parse the whitespace-separated text held in text_data; the first column
# (index_col=0) becomes the index of the resulting frame.
df = pd.read_table(StringIO(text_data), sep='\s+', index_col=0)
print 'DataFrame created with used_at row as header:'
print df
print 

# Show the column labels and the index name of the parsed frame.
# (print df.used_at would cause AttributeError: 'DataFrame' object has no
# attribute 'used_at'.)
print 'df columns    :', df.columns
print 'df index name :', df.index.name
print

DataFrame created with used_at row as header:
                   2014  2015
used_at                      
address             NaN   NaN
am.ru               621   273
auto.ru            1752  1595
avito.ru           5460  4631
avtomarket.ru       314   215
cars.mail.ru/sale   457   271
drom.ru            1934  1623
e1.ru              1654  1359
irr.ru/cars         619   426

df columns    : Index([u'2014', u'2015'], dtype='object')
df index name : used_at

tmthydvnprt 20.03.2016

จัดเรียงสตริงในคอลัมน์และพิมพ์กราฟ

คำตอบ (2)

ปัญหาการตั้งค่า DataFrame

คำถามในหัวข้อ